Last active
December 16, 2015 01:08
-
-
Save wangtzINT/5352348 to your computer and use it in GitHub Desktop.
This script converts a marked-up txt file into an Excel sheet (the executable version is generated with py2exe).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from distutils.core import setup
# py2exe is not referenced by name below; presumably it is imported for its
# side effect of making the py2exe build command available -- TODO confirm.
import py2exe
# This script is based on py2exe: it hands translation.py to setup() as a
# "windows" target so that an executable version of the script can be built.
setup(windows=["translation.py"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import xlrd | |
import codecs | |
import xlwt | |
from Tkinter import * | |
import tkFileDialog | |
import os | |
def getLineNumber(pos, content):
    """Return the 1-based line number of index ``pos`` inside ``content``.

    Newlines are counted in ``content[0:pos + 1]``, i.e. including the
    character at ``pos`` itself, so a ``pos`` that points at a newline
    character is reported as belonging to the following line.
    """
    textUpToPos = content[:pos + 1]
    return textUpToPos.count("\n") + 1
def valide(content):
    """Validate the markers of one article.

    Two checks are performed on ``content`` (the text of a single article):

    * every mandatory ``<field>`` marker must be present;
    * every opening marker ``<text0>``, ``<guide0>`` and ``<Nquestion0>``
      must be followed by its closing marker (``<text1>``, ``<guide1>``,
      ``<Nquestion1>``).

    Returns the error messages joined with newlines, or ``None`` when no
    problem was found.
    """
    def sugar(name):
        # wrap a bare field name into its marker form
        return "<" + name + ">"

    # list of error messages
    output = []

    # check of mandatory fields
    # according to requirements, all fields are mandatory
    mandatoryFields = ["source", "addressA", "addressB", "editor", "time",
                       "questionNum", "memo", "inputTime",
                       "text0", "guide0"]
    for field in mandatoryFields:
        if sugar(field) not in content:
            output.append("{0} field is mandatory but missed.".format(sugar(field)))

    # check of pair match
    pairs = [["text0", "text1"], ["guide0", "guide1"]]
    # Add one pair per numbered question; the scan stops at the first
    # missing <Nquestion0>, so questions after a gap are not checked.
    # TODO: bug: detection of *pair0*removed pair1
    i = 1
    while i < 10000 and sugar("{0}question0".format(i)) in content:
        pairs.append(["{0}question0".format(i), "{0}question1".format(i)])
        i += 1

    # if pair0 exists, while pair1 not -> problem
    for opener, closer in pairs:
        posPair0 = content.find(sugar(opener))
        # find() returns -1 when absent; index 0 is a legitimate hit (the
        # previous "> 0" test skipped an opener at the very start)
        if posPair0 == -1:
            continue
        # start after the opener's first character to prevent overlapping
        posPair1 = content.find(sugar(closer), posPair0 + 1)
        if posPair1 == -1:
            lineNumber = getLineNumber(posPair0, content)
            output.append("line {0}, operator {1} is not closed."
                          .format(lineNumber, sugar(opener)))

    if output:
        return "\n".join(output)
    return None
def processFile(filepath, filename):
    """Convert one marked-up text file into an Excel workbook.

    The file at ``filepath`` is read with the Windows "mbcs" (ANSI) codec.
    Each article is delimited by ``#<`` ... ``#>`` and contains ``<marker>``
    sections.  Every article becomes one row of the sheet; row 0 holds the
    column titles.  The workbook is saved as ``.\\output\\<name>.xls`` where
    ``<name>`` is ``filename`` without its last four characters (".txt").

    Returns a tuple ``(articleCount, result)``:

    * ``(0, message)`` when the ``#<`` / ``#>`` delimiters are unbalanced
      (nothing is written);
    * ``(0, "No article found!")`` when there is no article (nothing is
      written);
    * ``(n, True)`` when ``n`` articles were processed without any error;
    * ``(n, messages)`` when errors were found -- the workbook is still
      saved in this case.  Line numbers in these messages are relative to
      the start of the article they belong to.
    """
    # "No error reported yet for the current article" flag.  It is kept as a
    # module-level global, as in the original implementation, so that the
    # module's observable state is unchanged.
    global noErrorFound

    book = xlwt.Workbook()
    sheet1 = book.add_sheet('Sheet 1')
    # font used for every cell
    font0 = xlwt.Font()
    font0.name = 'Times New Roman'
    style0 = xlwt.XFStyle()
    style0.font = font0

    # NOTE: an earlier attempt to convert Word documents to plain text by
    # automating Word was abandoned (Office could not be launched), so only
    # plain-text input is handled here.

    # Python needs to know the encoding to read files;
    # "mbcs" corresponds to ANSI on Windows ("utf-8" would be UTF-8).
    # The with-block guarantees the file handle is released.
    with codecs.open(filepath, "r", "mbcs") as f:
        content = f.read()

    # re.S: dot matches newlines; re.VERBOSE: comments inside the pattern
    articles = re.finditer(r"""
        \#<         # starter
        (.*?)       # all content, non-greedy
        \#>         # terminator
        """, content, re.S | re.VERBOSE)
    output = []

    # validate that article delimiters are balanced
    pairs = ["#<", "#>"]
    lastPos = 0
    while True:
        pos0 = content.find(pairs[0], lastPos)
        # Search the closer from lastPos as well, so that a stray "#>"
        # located right after the previous article is not overlooked.
        pos1 = content.find(pairs[1], lastPos)
        if pos0 == -1:
            # there is no more left op
            if pos1 != -1:
                # still a right op => left op is missing
                lineNumber = getLineNumber(pos1, content)
                return (0, "line {0}, #< is missing.".format(lineNumber))
            # no more right op either => done
            break
        if pos1 == -1:
            # left op without any right op => right op is missing
            lineNumber = getLineNumber(pos0, content)
            return (0, "line {0}, #< is not closed.".format(lineNumber))
        if pos1 < pos0:
            # a right op appears before the next left op
            lineNumber = getLineNumber(pos1, content)
            return (0, "line {0}, #< is missing.".format(lineNumber))
        # make a pair and continue after it
        lastPos = pos1 + len(pairs[1])

    # article information starts from row 1 (row 0 holds the titles)
    articleIdx = 0
    isNoArticle = True
    for article in articles:
        articleIdx = articleIdx + 1
        isNoArticle = False
        noErrorFound = True
        articleContent = article.group(1)

        def reportError(message=None):
            """Emit the article header once, then ``message`` if given."""
            global noErrorFound
            if noErrorFound:
                output.append("** In article {0}: ".format(articleIdx))
                noErrorFound = False
            if message is not None:
                output.append(message)

        def markerLine(label):
            """Line (within the article) of the first ``<label>`` marker."""
            found = re.search("<{0}>".format(label), articleContent)
            return getLineNumber(found.start(), articleContent)

        def getFirstLineContent(s, content):
            """Return the first line of ``content``, reporting lost text."""
            if "\n" not in content:
                return content
            pos = content.index("\n")
            contentOfFirstLine = content[:pos]
            hasMoreText = content.rstrip() != contentOfFirstLine.rstrip()
            firstLineBlank = contentOfFirstLine.strip() == ""
            if hasMoreText or firstLineBlank:
                reportError()
            if hasMoreText:
                output.append(u"line {0}, only the first line of text in <{1}> has been captured".
                              format(markerLine(s), s))
            # a completely blank section has already produced its own
            # "is empty" message during the section scan
            if content.strip() != "" and firstLineBlank:
                output.append(u"line {0}, <{1}> is empty".
                              format(markerLine(s), s))
            return contentOfFirstLine

        validation = valide(articleContent)
        if validation is not None:
            reportError(validation.strip())

        rawSections = re.finditer(r"""
            <(?P<title>
                [\ \t]*\w*[\ \t]*   # match title
            )>
            (?P<content>
                .*?                 # match content, non-greedy
            )
            (?=                     # possitive lookahead (not counted for next iter
                (?P<end>
                    <[\ \t]*\w*[\ \t]*>|    # next title
                    # \#>|                  # or article teminitor (removed, cause it's not in the section)
                    $                       # or end of section
                )
            )
            """, articleContent, re.S | re.VERBOSE)

        exsitingSectionDict = dict()
        for rawSection in rawSections:
            titleLabel = rawSection.group("title")
            titleLine = getLineNumber(rawSection.start("title"), articleContent)
            # labels longer than 100 characters are discarded
            if len(titleLabel) > 100:
                continue
            if titleLabel != titleLabel.strip():
                reportError(u"line {0}, <{1}> is not valid, do you mean <{2}>?".
                            format(titleLine, titleLabel, titleLabel.strip()))
                # not a valid section, skip the remaining checks
                continue
            # check for nested markers: an opener "xxx0" must be directly
            # followed by its closer "xxx1"
            endLabel = rawSection.group("end")[1:-1]
            if titleLabel[-1:] == "0" and titleLabel[:-1] != endLabel[:-1]:
                reportError(u"line {0} - {1}, unexpected nested labels (plz check <{2}> and <{3}>)".
                            format(titleLine,
                                   getLineNumber(rawSection.start("end"),
                                                 articleContent),
                                   titleLabel,
                                   endLabel))
            # avoid empty sections (closers "xxx1" are allowed to be empty)
            if titleLabel[-1:] != "1" and \
                    rawSection.group("content").strip() == "":
                reportError(u"line {0}, <{1}> is empty".
                            format(titleLine, titleLabel))
            # avoid duplicated markers: only the first occurrence is kept
            if titleLabel not in exsitingSectionDict:
                exsitingSectionDict[titleLabel] = rawSection.group("content")
            else:
                reportError(u"line {0}, <{1}> is not unique".
                            format(titleLine, titleLabel))

        # fixed leading columns; closers (text1, guide1, Nquestion1) are
        # never written to the sheet
        col = 0
        PreRequiredSections = ["source", "addressA", "addressB",
                               "editor", "time", "inputTime",
                               "questionNum", "memo",
                               "text0", "guide0"]
        questionNum = None
        for s in PreRequiredSections:
            if articleIdx == 1:
                # in the title row "text0" should be "text", etc
                newIdxName = s
                if s[-1] == "0":
                    newIdxName = s[:-1]
                sheet1.write(0, col, newIdxName, style0)
            # the section may be absent from the input file;
            # in that case use " " to prevent text overlapping in the xls
            content = " "
            if s in exsitingSectionDict:
                content = exsitingSectionDict[s]
            # single-line markers can have only one line of content
            if s[-1] != "0":
                content = getFirstLineContent(s, content)
            if s == "questionNum" and s in exsitingSectionDict:
                if content.strip().isdigit():
                    questionNum = int(content.strip())
                else:
                    reportError(u"line {0}, <{1}> {2} is not a digital number".
                                format(markerLine(s), s, content))
                    # keep the raw text so the mismatch message can show it
                    questionNum = content
            sheet1.write(articleIdx, col, content, style0)
            col += 1
            # remove used items; some fields may not have been filled
            if s in exsitingSectionDict:
                del exsitingSectionDict[s]

        def singleLine(key, text):
            """Keep only the first line, except for question bodies/notes."""
            if key[-1] != "0" and key[-4:] != "note":
                return getFirstLineContent(key, text)
            return text

        def cellText(key):
            """Cell value for ``key``; " " avoids text overlap in the xls."""
            if key not in exsitingSectionDict:
                return " "
            return singleLine(key, exsitingSectionDict[key]) or " "

        # expanding Nquestion0, NpreAnswer, Nanswer, Nnote
        # without a usable questionNum the scan relies on the break below
        if isinstance(questionNum, int):
            upperBound = questionNum + 1
        else:
            upperBound = 10000
        for i in xrange(1, upperBound):
            questionTitle = "{0}question0".format(i)
            questionTitleEnd = "{0}question1".format(i)
            preAnswer = "{0}preAnswer".format(i)
            answerTitle = "{0}answer".format(i)
            noteTitle = "{0}note".format(i)
            # if any field exists, the whole block exists
            if questionTitle in exsitingSectionDict or \
                    preAnswer in exsitingSectionDict or \
                    answerTitle in exsitingSectionDict or \
                    noteTitle in exsitingSectionDict:
                try:
                    # the title cells may already have been written for a
                    # previous article; the resulting error is ignored
                    sheet1.write(0, col, questionTitle[:-1], style0)
                    sheet1.write(0, col+1, preAnswer, style0)
                    sheet1.write(0, col+2, answerTitle, style0)
                    sheet1.write(0, col+3, noteTitle, style0)
                except Exception:
                    pass
                # skip the leading "N." of the question text
                rawQuestionContent = cellText(questionTitle)
                realQuestionContentStartMatch = re.search(r"^\s*\d+\.\s*", rawQuestionContent)
                realQuestion = rawQuestionContent
                if realQuestionContentStartMatch:
                    realQuestion = rawQuestionContent[realQuestionContentStartMatch.end():]
                sheet1.write(articleIdx, col, realQuestion, style0)
                sheet1.write(articleIdx, col+1, cellText(preAnswer), style0)
                sheet1.write(articleIdx, col+2, cellText(answerTitle), style0)
                sheet1.write(articleIdx, col+3, cellText(noteTitle), style0)
                col += 4

                # remove the consumed sections; a missing one is an error
                errList = []
                try:
                    del exsitingSectionDict[questionTitle]
                    del exsitingSectionDict[questionTitleEnd]
                except KeyError:
                    errList.append(questionTitle)
                for usedKey in (preAnswer, answerTitle, noteTitle):
                    try:
                        del exsitingSectionDict[usedKey]
                    except KeyError:
                        errList.append(usedKey)
                for err in errList:
                    reportError(u"<{0}> field is mandatory but missed.".
                                format(err))
            else:
                if i-1 != questionNum:
                    reportError(u"{0} questions have been processed, which mismatch with declaration questionNum = {1}.".
                                format(i-1, questionNum))
                break

        # whatever is left in the dict is an unknown marker
        # unknown bug: unknown marker duddud0 will not be reported..
        for key in exsitingSectionDict:
            if key == "text1" or key == "guide1":
                # closers which have not been removed from the dict
                continue
            reportError(u"line {0}, unknown marker <{1}> found".
                        format(markerLine(key), key))

    if isNoArticle:
        return (articleIdx, "No article found!")
    # filename[:-4] skips the ending ".txt"
    # Chinese file names require a unicode file name
    book.save(u".\\output\\{0}.xls".format(filename[:-4]))
    if output == []:
        return (articleIdx, True)
    return (articleIdx, "\n".join(output))
def processAFile(txt):
    """Build the callback of the "process a new file" button.

    ``txt`` is the Tkinter Text widget used as the on-screen log.  The
    returned function takes no argument; each call asks the user for a
    .txt file, runs processFile() on it, reports the outcome in ``txt`` and
    leaves either "<name> - log.txt" (success) or "<name> - error.txt"
    (failure) in ./output, removing the other file from a previous run.
    """
    def main():
        # ask for txt file location
        filepath = tkFileDialog.askopenfilename(initialdir='.',
                                                filetypes=[("Text file", "*.txt")])
        if not filepath:
            # the dialog was cancelled: nothing to process
            return
        filename = os.path.basename(filepath)
        # .decode("mbcs") is not needed to display Chinese names thanks to
        # tkFileDialog.askopenfilename
        txt.insert(END, '=' * 5 + " " + filename + " " + '=' * 5 + '\n')
        txt.insert(END, "Start processing" + '\n')
        txt.yview(END)

        # processFile() and the report files below all write into ./output
        if not os.path.isdir("./output"):
            os.makedirs("./output")

        countArticles, output = processFile(filepath, filename)
        summary = '{0} article(s) have been processed'.format(countArticles) + '\n'
        txt.insert(END, summary)
        errorFileName = "./output/" + filename[:-4] + " - error.txt"
        logFileName = "./output/" + filename[:-4] + " - log.txt"
        if output is True:
            txt.insert(END, 'Successed' + '\n')
            if os.path.exists(errorFileName):
                # remove the error file of a previous run
                os.unlink(errorFileName)
            with open(logFileName, 'w') as fobj:
                fobj.write(summary)
        else:
            if os.path.exists(logFileName):
                # remove the log file of a previous run
                os.unlink(logFileName)
            txt.insert(END, 'Error:' + '\n')
            errorContent = "\t" + "\n\t".join(output.split("\n")) + '\n'
            txt.insert(END, errorContent)
            # create or rewrite the error file
            with open(errorFileName, 'w') as fobj:
                fobj.write(summary)
                fobj.write(output)
        txt.yview(END)
    return main
if __name__ == "__main__":
    # Main window: a scrollbar on the right, a button row on top and the
    # log text area filling the rest.
    root = Tk()
    root.geometry('500x200')

    scrollbar = Scrollbar(root)
    scrollbar.pack(side=RIGHT, fill=Y)

    toolbar = Frame(root)
    toolbar.pack(side=TOP, padx=1, pady=1)

    open_button = Button(toolbar, text='process a new file')
    open_button.pack()

    txt = Text(root, width=100, borderwidth=2)
    txt.pack(side=LEFT, fill=BOTH)

    # link the text area and the scrollbar in both directions
    scrollbar.config(command=txt.yview)
    txt.config(yscrollcommand=scrollbar.set)

    # the callback needs the text widget, so it is attached once txt exists
    open_button.config(command=processAFile(txt))

    # enter the Tk event loop
    root.mainloop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment