Created
September 5, 2011 07:53
-
-
Save venj/1194349 to your computer and use it in GitHub Desktop.
a small change to original ydDumper.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#This Python file uses the following encoding: utf-8
#Author:Superfan
#Version 1.0
#Latest Update:July,31,2011
#Usage:This script converts a youdao dictionary (plus version) to an Mdict source file. Put it in the same directory as the offline glossaries and run it with the Python VM.
#This script is for STUDY use ONLY.DO NOT USE IT FOR ANY COMMERCIAL PURPOSE!!!
# Small changes by Venj, 2011-09-05
from __future__ import division | |
import string,re,sys,os,time | |
import xml.parsers.expat | |
class xmlParser:
    """Turn one ``<word>...</word>`` XML entry into span-based HTML markup.

    Parsing runs from the constructor.  Afterwards the instance exposes:

    * ``xmlParsed`` -- the generated markup; every element ``<tag>`` of the
      input becomes ``<span class="tag">...</span>``.
    * ``name``      -- the head word, collected from the text found inside
      ``<return-phrase>``.
    * ``phrs``      -- a list of ``(phrase text, head word, anchor number)``
      tuples, one per text node found directly under ``phr/l/i``.
    """

    # Element names the converter expects.  Anything else triggers the
    # interactive diagnostic in StartElementHandler.
    KNOWN_TAGS = ('word', 'return-phrase', 'l', 'i', 'phone', 'trs', 'tr',
                  'phr', 'des', 'phrs', 'pos', 'wfs', 'wf', 'exam', 'f', 'n',
                  'syno', 'anto')

    # Text that starts with digits: group 1 is the digit run, group 2 the rest.
    _LEADING_DIGITS = re.compile(r'(^\d+)(.*)')

    def __init__(self, strXML):
        """Store the raw XML of one entry and parse it immediately."""
        self.strXML = strXML
        self.xmlParsed = ''
        self.name = ''
        self.stack = []          # names of the currently open elements
        self.entryNum = 0        # number of <tr> seen in the current <trs>/<phrs>
        self.phrsNum = 0         # running number used for the phrase anchors
        self.lNum = 0            # number of <l> elements seen so far
        self.lEncounterd = False  # True once the current <tr> got its number
        self.nameBegin = True    # True until the first head-word text arrives
        self.phrs = []
        self.dataBuffer = ''     # text of the current element, flushed on its end tag
        self.parse()

    def StartElementHandler(self, name, attrs):
        """Open a ``<span>`` for *name* and emit any decoration that precedes its content."""
        # tag checker: on an unknown tag, show it, dump the entry to tmp.xml
        # and wait for the operator before carrying on.
        if name not in self.KNOWN_TAGS:
            print(name)
            print(self.strXML)
            with open('tmp.xml', 'wb') as dump:
                dump.write(self.strXML)
            raw_input()
        self.stack.append(name)
        # Attributes are not expected either; report them the same way.
        if attrs:
            print(attrs)
            print(self.strXML)
            raw_input()
        if name in ('trs', 'phrs'):
            self.entryNum = 0
        if name == 'tr':
            self.lEncounterd = False
            self.entryNum += 1
        self.xmlParsed += '<span class="%s">' % name  # write current tag
        if name == 'l':
            self.lNum += 1
            # Only the first <l> of a <tr> inside <trs> (and outside the
            # head word) is prefixed with the entry number and the marker.
            # NOTE(review): the source had lost its indentation; both marker
            # spans are assumed to belong to this condition -- confirm.
            if (not self.lEncounterd
                    and 'return-phrase' not in self.stack
                    and 'trs' in self.stack):
                self.lEncounterd = True
                self.xmlParsed += '<span class="entryNum">%d.</span>' % self.entryNum
                self.xmlParsed += u'<span class="entryDot">■</span>'
        if name == 'syno':
            self.xmlParsed += u'<span class="tongyici">近义词</span>'
        if name == 'anto':
            self.xmlParsed += u'<span class="fanyici">反义词</span>'
        if name == 'wf':
            self.xmlParsed += u'<span class="bianxing">变形</span>'

    def EndElementHandler(self, name):
        """Flush the buffered text of the closing element and close its ``<span>``."""
        if self.dataBuffer != '':
            if self.stack[-3:] == ['phr', 'l', 'i']:
                # Phrase text becomes a named anchor and is remembered so a
                # cross-reference entry can point back at it.
                self.phrsNum += 1
                if self.dataBuffer.endswith(';'):
                    self.dataBuffer = self.dataBuffer[:-1]
                self.phrs.append((self.dataBuffer, self.name, self.phrsNum))
                self.xmlParsed += ('<a name="phr%d">' % self.phrsNum
                                   + self.dataBuffer + '</a>')
            else:
                self.xmlParsed += self.dataBuffer
            self.dataBuffer = ''
        self.stack.pop()
        self.xmlParsed += r'</span>'

    def CharacterDataHandler(self, data):
        """Collect text: head word into ``name``, phonetics in brackets, the rest buffered."""
        if 'return-phrase' in self.stack:
            numbered = self._LEADING_DIGITS.search(data)
            # Head-word text that starts with digits and is not the first
            # piece of the head word: the digits are wrapped in an "upper" span.
            if numbered is not None and not self.nameBegin:
                self.name += ' %s' % data
                self.dataBuffer += '<span class="upper">%s</span>%s' % numbered.groups()
                return
            self.nameBegin = False
            self.name += data.replace('\n', '')
        if self.stack[-1] == 'phone':
            # Phonetic text is written straight out, wrapped in brackets.
            self.xmlParsed += '[' + data + ']'
            return
        self.dataBuffer += data

    def parse(self):
        """Run expat over ``strXML`` with this object's handlers attached."""
        p = xml.parsers.expat.ParserCreate('UTF-8')
        p.StartElementHandler = self.StartElementHandler
        p.EndElementHandler = self.EndElementHandler
        p.CharacterDataHandler = self.CharacterDataHandler
        try:
            p.returns_unicode = True
        except AttributeError:
            # The attribute only exists on older expat bindings; where it is
            # gone the handlers always receive unicode text anyway.
            pass
        p.Parse(self.strXML)
# Lookup table mapping every byte value b to 255 - b.
_INVERT_TABLE = bytes(bytearray(range(255, -1, -1)))


def decrypt(str):
    """Return *str* with every byte b replaced by 255 - b.

    The transformation is its own inverse, so the same call both scrambles
    and unscrambles.  An empty input yields an empty result.

    *str* is a byte string (the parameter name is kept for compatibility,
    although it shadows the builtin).  The whole buffer is translated in one
    call instead of being rebuilt one character at a time.
    """
    return str.translate(_INVERT_TABLE)
def _iter_word_entries(f):
    """Yield each decrypted fragment of *f* up to and including ``</word>``.

    The file is read in 10 MiB pieces and topped up whenever less than
    5 MiB of unread text remains.  A read offset is tracked instead of
    re-slicing the buffer after every entry, so consuming an entry does not
    copy the rest of the buffer.  Iteration stops as soon as the unread text
    contains no further ``</word>``.
    """
    end_tag = b'</word>'
    buffer = b''
    pos = 0          # start of the not yet consumed part of buffer
    eof = False
    while True:
        if not eof and len(buffer) - pos < 5 * 1024 * 1024:
            chunk = f.read(10 * 1024 * 1024)
            if chunk:
                buffer = buffer[pos:] + decrypt(chunk)
                pos = 0
            else:
                eof = True
        end = buffer.find(end_tag, pos)
        if end == -1:
            return
        end += len(end_tag)
        yield buffer[pos:end]
        pos = end


def parse_dict(file, delDuplicate=False):
    """Convert the dictionary *file* into a text file next to it.

    The output path is *file* with its extension replaced by ``.txt``.
    For every ``<word>`` entry the output receives the head word, a
    stylesheet link, the converted markup and a ``</>`` terminator line, all
    separated by CR LF.  After the entries, one cross-reference record is
    written per phrase collected by xmlParser, linking back to the anchor
    inside the entry it came from.

    file         -- path of the dictionary file to read.
    delDuplicate -- whether or not to skip entries and references whose name
                    was already written (for xhy.ydic keep this False).
    """
    crlf = b'\x0d\x0a'
    stylesheet = b'<link rel="stylesheet" type="text/css" href="sf_ecce.css"/>'
    counter = 0
    phrs = []
    names = set()    # only used for membership tests
    # splitext keeps directory parts that contain dots ("./x.ydic") intact.
    output_file = os.path.splitext(file)[0] + '.txt'
    with open(file, 'rb') as f:
        # The first 0x404 bytes are skipped.
        # NOTE(review): presumably a file header -- confirm.
        f.seek(0x404)
        with open(output_file, 'wb') as output:
            for fragment in _iter_word_entries(f):
                counter += 1
                P = xmlParser(fragment)
                if delDuplicate and P.name in names:
                    continue
                output.write(P.name.encode('UTF-8') + crlf)
                names.add(P.name)
                output.write(stylesheet + crlf
                             + P.xmlParsed.encode('UTF-8')
                             + crlf + b'</>' + crlf)
                phrs += P.phrs
                if counter % 10 == 0:
                    sys.stdout.write('%d\r' % counter)
            print('Done!Total %d entries' % counter)
            print('Processing references...Total:%d' % len(phrs))
            total = len(phrs)
            for counter, (phrName, entry, phrNum) in enumerate(phrs, 1):
                if delDuplicate and phrName in names:
                    continue
                if counter % 20 == 0:
                    # Scale to 0-100 so the figure really is a percentage.
                    sys.stdout.write('Percentage:%%%.2f\r'
                                     % (100.0 * counter / total))
                names.add(phrName)
                output.write(phrName.encode('utf-8'))
                output.write(crlf + stylesheet + crlf)
                output.write(u'<span class="reference"><span class="ref_title">见:</span>'.encode('utf-8'))
                output.write(b'<a href="entry://%s#phr%d">%s</a></span>\x0d\x0a</>\x0d\x0a'
                             % (entry.encode('utf-8'), phrNum,
                                phrName.encode('utf-8')))
# Command-line entry: every argument after the script name is a dictionary
# file to convert.
if len(sys.argv) < 2:
    print("Usage: %s dict_file_1 [...]" % sys.argv[0])
    # sys.exit instead of the interactive-shell helper exit(), which is only
    # installed by the site module.
    sys.exit(1)
sys.argv.pop(0)  # drop the script name; the rest are input files
for f in sys.argv:
    print("Processing: %s" % f)
    parse_dict(f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment