I’d been planning on watching Solaris for a long time, but having it appear in both this list of free movies and Akira Kurosawa’s 100 favorites was an extra kick in the pants. I preferred to download a file rather than deal with the hiccups of streaming, but the subtitle file I found didn’t mesh well with it. The subtitles started way too late, and even after I wrote a script to apply a base correction, the discrepancy grew over time. What I’ve got now still has a growing discrepancy (although in the opposite direction), but it’s less than a minute off by the final line, which I suppose isn’t too bad for a movie over two hours long. At any rate, I figured I’d put what I wrote here.

import os
import re

def millisToTimestamp(millis):
    """Format a millisecond count as an ``HH:MM:SS,mmm`` subtitle timestamp.

    Hours, minutes and seconds are zero-padded to at least two digits and
    the millisecond field to at least three. The hours field is not
    capped, so it simply grows wider for very large inputs.
    """
    # Peel off one unit at a time, smallest first.
    whole_seconds, ms_part = divmod(millis, 1000)
    whole_minutes, sec_part = divmod(whole_seconds, 60)
    hour_part, min_part = divmod(whole_minutes, 60)

    fields = (hour_part, min_part, sec_part, ms_part)
    widths = (2, 2, 2, 3)
    padded = [
        str(int(value)).rjust(width, '0')
        for value, width in zip(fields, widths)
    ]
    return "{0}:{1}:{2},{3}".format(*padded)

def timestampToMillis(timestamp):
    """Convert an ``HH:MM:SS,mmm`` timestamp string to total milliseconds.

    The seconds and millisecond digits are fused into one number by
    dropping the comma, so the fractional part is assumed to be exactly
    three digits long.
    """
    ms_per_minute = 60 * 1000
    ms_per_hour = 60 * ms_per_minute

    pieces = timestamp.split(":")
    # "SS,mmm" -> "SSmmm", which already reads as a millisecond count.
    pieces[2] = pieces[2].replace(',', '')
    numbers = [int(piece) for piece in pieces]

    return (numbers[0] * ms_per_hour
            + numbers[1] * ms_per_minute
            + sum(numbers[2:]))

# Matches the timing line of a subtitle cue, e.g.
# "00:00:17,592 --> 00:00:20,100", and captures the two stamps as the
# named groups 'beginstamp' and 'endstamp' (the names modifyFile reads).
timePattern = re.compile(
    r'(?P<beginstamp>\d\d:\d\d:\d\d,\d{3}) --> (?P<endstamp>\d\d:\d\d:\d\d,\d{3})'
)

def modifyTimestamp(timestamp,
                    firststamp='00:00:17,592',
                    lastdialoguestamp='02:46:40,698',
                    targetstamp='02:39:23,000'):
    """Retimes one subtitle timestamp with a two-point linear correction.

    The subtitle file runs late by a fixed offset and additionally drifts
    further out of sync as the film goes on. The correction is the straight
    line that sends ``firststamp`` to ``00:00:00,000`` and
    ``lastdialoguestamp`` to ``targetstamp``.

    Parameters:
        timestamp: the ``HH:MM:SS,mmm`` stamp to correct.
        firststamp: the constant offset to remove (the base excess).
        lastdialoguestamp: where the subtitle file places the last line.
        targetstamp: where that last line should actually appear.

    Returns the corrected ``HH:MM:SS,mmm`` stamp. A stamp containing
    ``00:00:00,000`` is returned unchanged, and results that would fall
    before the start of the film are clamped to ``00:00:00,000``.

    Raises ValueError if ``lastdialoguestamp`` is not later than
    ``firststamp``, since no slope can be derived from such a pair.
    """
    if '00:00:00,000' in timestamp:
        return timestamp

    baseExcess = timestampToMillis(firststamp)
    sourceSpan = timestampToMillis(lastdialoguestamp) - baseExcess
    if sourceSpan <= 0:
        raise ValueError('lastdialoguestamp must be later than firststamp')
    targetMillis = timestampToMillis(targetstamp)

    # The slope is measured over the span that remains once the base offset
    # is removed, so the anchor line lands exactly on targetstamp instead of
    # being pulled early by the base offset a second time.
    elapsed = timestampToMillis(timestamp) - baseExcess
    # Pure integer arithmetic: exact, and immune to Python 2 truncating an
    # int/int ratio to zero.
    finalMillis = elapsed * targetMillis // sourceSpan

    # Stamps earlier than the base offset would come out negative, which
    # cannot be written as a valid subtitle timestamp.
    return millisToTimestamp(max(finalMillis, 0))

def modifyFile(filename):
    """Write a retimed copy of the subtitle file ``filename``.

    Every line that starts with a ``begin --> end`` timing pair has both
    stamps passed through modifyTimestamp; every other line is copied
    through untouched.

    The copy is written next to the input with ``edited`` inserted before
    the extension (``movie.srt`` -> ``movieedited.srt``), or appended when
    the name has no extension.

    Returns the path of the file that was written.
    """
    # splitext only splits on the final extension, so dots elsewhere in the
    # path (directories, multi-dot names) are left alone.
    root, extension = os.path.splitext(filename)
    outname = root + 'edited' + extension

    with open(filename) as infile, open(outname, 'w') as outfile:
        for line in infile:
            result = timePattern.match(line)
            if result:
                newbegin = modifyTimestamp(result.group('beginstamp'))
                newend = modifyTimestamp(result.group('endstamp'))
                # Rebuild the timing pair rather than using str.replace, so
                # a corrected begin stamp can never be rewritten a second
                # time by the end-stamp substitution. Anything after the
                # pair (including the newline) is preserved.
                line = newbegin + ' --> ' + newend + line[result.end():]
            outfile.write(line)
    return outname

# Script entry point: runs only when the file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
        import sys
        args = sys.argv
        # args[0] is the script path, so a subtitle filename was supplied
        # only when there is at least one more entry.
        if len(args) > 1:
                filename = args[1]
                # NOTE(review): the visible excerpt ends here without using
                # filename; presumably modifyFile(filename) follows - confirm.