Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Mixed (English and Chinese) subtitle generation. Details: http://www.hanxiaogang.com/?p=38201
#-*- coding:gbk-*-
import nltk
import os
def readCorpus(path='corpus/ldc_ec_dict.2.0.txt'):
    """Load the English-to-Chinese dictionary used for annotation.

    Each useful line has the form ``english/chinese[/...]``.  Only the
    first two '/'-separated fields are used; the English field is stripped
    and lower-cased to form the key, and the (stripped) second field is the
    value.  When a key occurs more than once, the last occurrence wins.

    Args:
        path: Location of the dictionary file.  Defaults to the corpus
            file the script has always used.

    Returns:
        A dict mapping lower-cased English words to their translation.
    """
    d = {}
    # The context manager guarantees the handle is closed even on error.
    with open(path) as corpus:
        for line in corpus:
            fields = line.split('/')
            if len(fields) < 2:
                # Blank or malformed line without a '/' separator: skip it
                # instead of failing on the two-name unpack below.
                continue
            eng, chn = fields[:2]
            # Strip the value too: when the translation is the last field
            # on the line it still carries the line terminator.
            d[eng.strip().lower()] = chn.strip()
    return d
def _loadKnownWords():
    """Return the lower-cased set of words that must not be annotated."""
    # known.txt is ';'-separated; the other lists are newline-separated.
    sources = (
        ('corpus/known.txt', ';'),
        ('corpus/knownVOA.txt', '\n'),
        ('corpus/known3000.txt', '\n'),
        ('corpus/local.txt', '\n'),
    )
    known = set()
    for path, separator in sources:
        with open(path) as fknown:
            # Tokens compared against this set are purely alphabetic, so an
            # entry carrying stray whitespace (e.g. '\r' or a newline after
            # ';') could never match; strip it so the entry is effective.
            known.update(entry.strip().lower()
                         for entry in fknown.read().split(separator))
    return known


def _annotateLine(line, d, known, stemmer):
    """Return one subtitle text line with unknown words annotated."""
    # Tokens that are glued to the previous token without a space.
    attached = ("n't", '.', ',', '?', ':')
    pieces = []
    for word in nltk.word_tokenize(line):
        if len(word) > 3 and word.isalpha():
            lower_word = word.lower()
            stemmed_word = stemmer.stem(lower_word)
            if lower_word in known or stemmed_word in known:
                pieces.append(' ' + word)
                continue
            # Look up the exact form first, then the stem; 'NIL' marks a
            # word that is neither known nor in the dictionary.
            if lower_word in d:
                chinese = d[lower_word]
            elif stemmed_word in d:
                chinese = d[stemmed_word]
            else:
                chinese = 'NIL'
            pieces.append(' ' + word + '(' + chinese + ')')
        elif word in attached or word.startswith("'"):
            pieces.append(word)
        else:
            pieces.append(' ' + word)
    return ''.join(pieces)


def writeSubtitle(d, infile, outfile):
    """Write a copy of a subtitle file with unknown words translated.

    Lines that are empty, purely numeric, or contain '-->' are copied
    through (stripped).  Every other line is tokenized with nltk; each
    alphabetic token longer than three characters that is not a known word
    (neither as-is nor stemmed) is followed by its dictionary entry in
    parentheses, or by ``(NIL)`` when the dictionary has no entry.

    Args:
        d: Mapping from lower-cased English word to translation.
        infile: Name of the subtitle file inside the ``input/`` directory.
        outfile: Path of the file to write.
    """
    known = _loadKnownWords()
    stemmer = nltk.PorterStemmer()

    # Read the input completely before opening the output, so a missing
    # input file does not leave a truncated output file behind.
    with open('input/' + infile) as fread:
        lines = fread.readlines()

    with open(outfile, 'w') as fwrite:
        for line in lines:
            line = line.strip()
            if not line or line.isdigit() or '-->' in line:
                fwrite.write(line + '\n')
            else:
                # A subtitle text line.
                # NOTE(review): the character before the newline is
                # reproduced exactly as found; it looks like a mis-decoded
                # byte sequence - confirm against the original file.
                fwrite.write(_annotateLine(line, d, known, stemmer) + 'һ\n')
if __name__ == '__main__':
    # Load the dictionary once and reuse it for every subtitle file.
    d = readCorpus()
    for infile in os.listdir("./input"):
        # Parenthesized form prints the same text on Python 2 and is also
        # valid syntax on Python 3.
        print(infile)
        # Alternative output naming, kept for reference:
        #end = infile.find('S02E')
        #outfile = infile[:end+6]+'.srt'
        # The output path is the bare input name, i.e. relative to the
        # current working directory rather than to ./input.
        writeSubtitle(d, infile, infile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment