Skip to content

Instantly share code, notes, and snippets.

@wangtzINT
Last active December 16, 2015 01:08
Show Gist options
  • Save wangtzINT/5352348 to your computer and use it in GitHub Desktop.
Save wangtzINT/5352348 to your computer and use it in GitHub Desktop.
This script translates a txt file into an Excel sheet (the executable version is generated by py2exe).
from distutils.core import setup
# NOTE(review): py2exe is not referenced by name below; presumably it is
# imported for its side effect of making the "py2exe" build command
# available to distutils -- confirm against the py2exe documentation.
import py2exe
# Build script based on py2exe.
# Declares translation.py as the single script to package; the "windows"
# keyword is presumably py2exe's option for GUI (console-less) executables
# -- TODO confirm.
setup(windows=["translation.py"])
import re
import xlrd
import codecs
import xlwt
from Tkinter import *
import tkFileDialog
import os
def getLineNumber(pos, content):
    """Return the 1-based line number of character offset *pos* in *content*.

    The number of newline characters in ``content[0:pos + 1]`` is counted,
    so an offset that points exactly at a ``"\\n"`` is reported as belonging
    to the following line.
    """
    return content.count("\n", 0, pos + 1) + 1
def valide(content):
    """Validate the mandatory tags and the open/close tag pairs of an article.

    *content* is the text of one article.  Two checks are performed:

    * every mandatory ``<field>`` tag must occur somewhere in *content*;
    * for every pair (``text0``/``text1``, ``guide0``/``guide1`` and
      ``Nquestion0``/``Nquestion1`` for N = 1, 2, ... as long as
      ``<Nquestion0>`` is present) an opening tag must be followed by its
      closing tag.

    Returns the error messages joined with newlines, or ``None`` when no
    problem was found.
    """
    def sugar(name):
        # wrap a bare field name into its tag form
        return "<" + name + ">"

    # list of error messages
    output = []

    # according to requirements, all fields are mandatory
    mandatoryFields = ["source", "addressA", "addressB", "editor", "time",
                       "questionNum", "memo", "inputTime",
                       "text0", "guide0"]
    for field in mandatoryFields:
        if sugar(field) not in content:
            output.append("{0} field is mandatory but missed.".format(sugar(field)))

    # pairs whose opening tag requires a closing tag
    pairs = [["text0", "text1"], ["guide0", "guide1"]]
    # Question pairs are numbered consecutively from 1; stop at the first
    # number whose opening tag is absent (same 1..9999 bound as before).
    # TODO: bug: detection of *pair0*removed pair1
    questionIdx = 1
    while questionIdx < 10000 and \
            sugar("{0}question0".format(questionIdx)) in content:
        pairs.append(["{0}question0".format(questionIdx),
                      "{0}question1".format(questionIdx)])
        questionIdx += 1

    # if pair0 exists, while pair1 not -> problem
    for opener, closer in pairs:
        posPair0 = content.find(sugar(opener))
        # str.find returns -1 when absent; offset 0 is a legitimate hit
        # (the previous "> 0" test skipped a tag at the very start).
        if posPair0 == -1:
            continue
        # start after the opener to prevent overlapping
        if content.find(sugar(closer), posPair0 + 1) == -1:
            lineNumber = getLineNumber(posPair0, content)
            output.append("line {0}, operator {1} is not closed."
                          .format(lineNumber, sugar(opener)))

    if output:
        return "\n".join(output)
    return None
def _findArticleMarkerError(content):
    """Check that the "#<" / "#>" article markers in *content* are balanced.

    Returns an error message for the first problem found, or ``None`` when
    every "#<" is followed by a "#>" and no "#>" appears without an opener.
    """
    opener, closer = "#<", "#>"
    lastPos = 0
    while True:
        pos0 = content.find(opener, lastPos)
        # Search from lastPos itself: starting further right would overlook
        # a stray "#>" placed immediately after the previous article.
        pos1 = content.find(closer, lastPos)
        if pos0 == -1:
            if pos1 != -1:
                # still right op => left op is missing
                return "line {0}, #< is missing.".format(
                    getLineNumber(pos1, content))
            # neither marker left => done
            return None
        if pos1 == -1:
            # no more right op => right op is missing
            return "line {0}, #< is not closed.".format(
                getLineNumber(pos0, content))
        if pos1 < pos0:
            # right op before left op
            return "line {0}, #< is missing.".format(
                getLineNumber(pos1, content))
        # make a pair and continue behind it
        lastPos = pos1 + len(closer)


def _processArticle(articleIdx, articleContent, sheet1, style0, output):
    """Validate one article and write it to row *articleIdx* of *sheet1*.

    Error messages are appended to *output*; the first message of an article
    is preceded by a "** In article N: " header line.  Line numbers in the
    messages are relative to *articleContent*.
    """
    # one-element list used as a mutable flag (no "nonlocal" in Python 2)
    headerPending = [True]

    def report(message):
        # emit the per-article header once, right before the first message
        if headerPending[0]:
            output.append("** In article {0}: ".format(articleIdx))
            headerPending[0] = False
        output.append(message)

    def lineAt(pos):
        return getLineNumber(pos, articleContent)

    def lineOfTag(tag):
        # line of the first occurrence of <tag> in this article
        return lineAt(re.search("<{0}>".format(tag), articleContent).start())

    def getFirstLineContent(s, content):
        # Return the first line of a single-line section and report text
        # that would be lost by the truncation.
        pos = content.find("\n")
        if pos == -1:
            return content
        contentOfFirstLine = content[:pos]
        if content.rstrip() != contentOfFirstLine.rstrip():
            report(u"line {0}, only the first line of text in <{1}> has been captured".
                   format(lineOfTag(s), s))
        # a completely blank section has been reported by the section scan
        if content.strip() != "" and contentOfFirstLine.strip() == "":
            report(u"line {0}, <{1}> is empty".format(lineOfTag(s), s))
        return contentOfFirstLine

    validation = valide(articleContent)
    if validation is not None:
        report(validation.strip())

    rawSections = re.finditer(r"""
        <(?P<title>
            [\ \t]*\w*[\ \t]* # match title
        )>
        (?P<content>
            .*? # match content, non-greedy
        )
        (?= # possitive lookahead (not counted for next iter
            (?P<end>
                <[\ \t]*\w*[\ \t]*>| # next title
                # \#>| # or article teminitor (removed, cause it's not in the section)
                $ # or end of section
            )
        )
        """, articleContent, re.S | re.VERBOSE)

    exsitingSectionDict = dict()
    for rawSection in rawSections:
        titleLabel = rawSection.group("title")
        sectionLine = lineAt(rawSection.start("title"))
        # labels longer than 100 characters are discarded
        if len(titleLabel) > 100:
            continue
        if titleLabel != titleLabel.strip():
            report(u"line {0}, <{1}> is not valid, do you mean <{2}>?".
                   format(sectionLine, titleLabel, titleLabel.strip()))
            # no valid section, skip following processing
            continue
        # check for nested marker: an opening "...0" label must be followed
        # by a label sharing its name up to the last character
        endLabel = rawSection.group("end")[1:-1]
        if titleLabel[-1:] == "0" and titleLabel[:-1] != endLabel[:-1]:
            report(u"line {0} - {1}, unexpected nested labels (plz check <{2}> and <{3}>)".
                   format(sectionLine,
                          lineAt(rawSection.start("end")),
                          titleLabel,
                          endLabel))
        # avoid empty section (closing "...1" labels carry no content)
        if titleLabel[-1:] != "1" and \
                rawSection.group("content").strip() == "":
            report(u"line {0}, <{1}> is empty".format(sectionLine, titleLabel))
        # avoid duplicated marker: the first occurrence wins
        if titleLabel not in exsitingSectionDict:
            exsitingSectionDict[titleLabel] = rawSection.group("content")
        else:
            report(u"line {0}, <{1}> is not unique".
                   format(sectionLine, titleLabel))

    # fixed leading columns
    col = 0
    PreRequiredSections = ["source", "addressA", "addressB",
                           "editor", "time", "inputTime",
                           "questionNum", "memo",
                           "text0", "guide0"]
    questionNum = None
    for s in PreRequiredSections:
        if articleIdx == 1:
            # header row; in output "text0" should be "text", etc
            newIdxName = s
            if s[-1] == "0":
                newIdxName = s[:-1]
            sheet1.write(0, col, newIdxName, style0)
        # it's possible that s section is not included in input file
        # in this case, use " " to prevent text overlapping in xls
        content = " "
        if s in exsitingSectionDict:
            content = exsitingSectionDict[s]
            # single line marker can have only one line of content
            if s[-1] != "0":
                content = getFirstLineContent(s, content)
            if s == "questionNum":
                if content.strip().isdigit():
                    questionNum = int(content.strip())
                else:
                    report(u"line {0}, <{1}> {2} is not a digital number".
                           format(lineOfTag(s), s, content))
                    questionNum = content
        sheet1.write(articleIdx, col, content, style0)
        col += 1
        # at the end, remove used items (some fields may be missing)
        if s in exsitingSectionDict:
            del exsitingSectionDict[s]

    def cellText(key):
        # Text to store for an optional section; " " avoids text overlap in
        # the sheet when the section is absent or empty.
        if key not in exsitingSectionDict:
            return " "
        text = exsitingSectionDict[key]
        # for some sections only the first line is needed
        if key[-1] != "0" and key[-4:] != "note":
            text = getFirstLineContent(key, text)
        return text or " "

    # expanding Nquestion0, NpreAnswer, Nanswer, Nnote; the upper bound of
    # the range is exclusive
    if isinstance(questionNum, int):
        questionLimit = questionNum + 1
    else:
        questionLimit = 10000
    for i in xrange(1, questionLimit):
        questionTitle = "{0}question0".format(i)
        questionTitleEnd = "{0}question1".format(i)
        preAnswer = "{0}preAnswer".format(i)
        answerTitle = "{0}answer".format(i)
        noteTitle = "{0}note".format(i)
        blockTitles = (questionTitle, preAnswer, answerTitle, noteTitle)

        # If any field exists, the whole block exists
        if not any(title in exsitingSectionDict for title in blockTitles):
            if i - 1 != questionNum:
                report(u"{0} questions have been processed, which mismatch with declaration questionNum = {1}.".
                       format(i - 1, questionNum))
            break

        try:
            sheet1.write(0, col, questionTitle[:-1], style0)
            sheet1.write(0, col + 1, preAnswer, style0)
            sheet1.write(0, col + 2, answerTitle, style0)
            sheet1.write(0, col + 3, noteTitle, style0)
        except Exception:
            # xlwt signals "attempt to overwrite cell" with a plain
            # Exception: an earlier article already wrote these headers.
            pass

        # skip a leading "N." in the question text
        rawQuestionContent = cellText(questionTitle)
        leadingNumber = re.search(r"^\s*\d+\.\s*", rawQuestionContent)
        realQuestion = rawQuestionContent
        if leadingNumber:
            realQuestion = rawQuestionContent[leadingNumber.end():]
        sheet1.write(articleIdx, col, realQuestion, style0)
        sheet1.write(articleIdx, col + 1, cellText(preAnswer), style0)
        sheet1.write(articleIdx, col + 2, cellText(answerTitle), style0)
        sheet1.write(articleIdx, col + 3, cellText(noteTitle), style0)
        col += 4

        # consume the block; anything that cannot be removed was missing
        errList = []
        try:
            del exsitingSectionDict[questionTitle]
            del exsitingSectionDict[questionTitleEnd]
        except KeyError:
            errList.append(questionTitle)
        for title in (preAnswer, answerTitle, noteTitle):
            try:
                del exsitingSectionDict[title]
            except KeyError:
                errList.append(title)
        for err in errList:
            report(u"<{0}> field is mandatory but missed.".format(err))

    # whatever is left in the dict was not consumed above
    # unknown bug: unknown marker duddud0 will not be reported..
    for key in exsitingSectionDict:
        if key == "text1" or key == "guide1":
            # marker ended with 1 is a terminator which has not been removed from dict
            continue
        report(u"line {0}, unknown marker <{1}> found".
               format(lineOfTag(key), key))


def processFile(filepath, filename):
    """Convert the tagged text file *filepath* into ``./output/<name>.xls``.

    The file is read with the "mbcs" codec (ANSI on Windows).  Articles are
    the spans enclosed in "#<" ... "#>"; each article becomes one row of the
    sheet, with the header row written to row 0.

    filepath -- path of the text file to read
    filename -- base name of that file; its last four characters (".txt")
                are dropped to build the name of the .xls file

    Returns a tuple ``(articleCount, result)``:

    * ``(0, message)`` when the "#<" / "#>" markers are unbalanced or no
      article exists (nothing is saved in these cases);
    * ``(n, True)`` when n articles were written without any error;
    * ``(n, messages)`` when the sheet was saved but problems were found;
      *messages* is a newline-joined report.

    Unlike earlier revisions, no module-level ``noErrorFound`` flag is set;
    the "header already printed" state is kept per article.
    """
    book = xlwt.Workbook()
    sheet1 = book.add_sheet('Sheet 1')
    # font
    font0 = xlwt.Font()
    font0.name = 'Times New Roman'
    style0 = xlwt.XFStyle()
    style0.font = font0

    # Python need to know the encoding to read files
    # mbcs corresponds to ANSI in windows
    with codecs.open(filepath, "r", "mbcs") as f:
        content = f.read()

    # valide article sections before extracting anything
    markerError = _findArticleMarkerError(content)
    if markerError is not None:
        return (0, markerError)

    # re.S dot match all
    # re.VERBOSE comments on regular expression
    articles = re.finditer(r"""
        \#< # starter
        (.*?) # all content, non-greedy
        \#> # terminator
        """, content, re.S | re.VERBOSE)

    output = []
    # article information starts from line 1 (line 0 is the header row)
    articleIdx = 0
    for article in articles:
        articleIdx += 1
        _processArticle(articleIdx, article.group(1), sheet1, style0, output)

    if articleIdx == 0:
        return (articleIdx, "No article found!")

    # filename[:-4] skip ending .txt
    # Chinese file name require unicode file name
    outputDir = os.path.join(u".", u"output")
    if not os.path.isdir(outputDir):
        os.makedirs(outputDir)
    book.save(os.path.join(outputDir, u"{0}.xls".format(filename[:-4])))

    if not output:
        return (articleIdx, True)
    return (articleIdx, "\n".join(output))
def processAFile(txt):
    """Return a no-argument callback that converts one user-selected file.

    txt -- Tkinter Text widget that receives the progress and error output.

    The returned callback asks for a ``*.txt`` file, runs ``processFile`` on
    it, echoes the result into *txt* and writes either
    ``./output/<name> - log.txt`` (success) or
    ``./output/<name> - error.txt`` (problems found), removing the other
    file if it is left over from an earlier run.  Nothing happens when the
    file dialog is cancelled.
    """
    def main():
        # ask for txt file location
        filepath = tkFileDialog.askopenfilename(
            initialdir='.', filetypes=[("Text file", "*.txt")])
        if not filepath:
            # dialog cancelled: there is nothing to process
            return
        filename = os.path.split(filepath)[1]

        # .decode("mbcs") is not needed with help of tkFileDialog.askopenfilename
        txt.insert(END, '=' * 5 + " " + filename + " " + '=' * 5 + '\n')
        txt.insert(END, "Start processing" + '\n')
        txt.yview(END)

        countArticles, output = processFile(filepath, filename)
        summary = '{0} article(s) have been processed'.format(countArticles) + '\n'
        txt.insert(END, summary)

        # the report files live next to the generated sheet
        outputDir = "./output/"
        if not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        errorFileName = outputDir + filename[:-4] + " - error.txt"
        logFileName = outputDir + filename[:-4] + " - log.txt"

        if output is True:
            txt.insert(END, 'Successed' + '\n')
            if os.path.exists(errorFileName):
                # remove error file
                os.unlink(errorFileName)
            with open(logFileName, 'w') as fobj:
                fobj.write(summary)
        else:
            if os.path.exists(logFileName):
                # remove log file
                os.unlink(logFileName)
            txt.insert(END, 'Error:' + '\n')
            errorContent = "\t" + "\n\t".join(output.split("\n")) + '\n'
            txt.insert(END, errorContent)
            # create or rewrite error file
            with open(errorFileName, 'w') as fobj:
                fobj.write(summary)
                # The report can quote non-ASCII text from the input file;
                # under Python 2 an implicit ASCII encode would fail, so
                # encode with the codec the input is read with.
                fobj.write(output.encode("mbcs"))
        txt.yview(END)

    return main
if __name__ == "__main__":
    # root window of the application
    root = Tk()
    root.geometry('500x200')

    # vertical scrollbar docked on the right edge
    scrollbar = Scrollbar(root)
    scrollbar.pack(side=RIGHT, fill=Y)

    # top row holding the single action button
    label_line = Frame(root)
    label_line.pack(side=TOP, padx=1, pady=1)
    add_button = Button(label_line, text='process a new file')
    add_button.pack()

    # text area showing progress and error messages
    txt = Text(root, width=100, borderwidth=2)
    txt.pack(side=LEFT, fill=BOTH)

    # couple the scrollbar and the text area in both directions
    scrollbar.config(command=txt.yview)
    txt.config(yscrollcommand=scrollbar.set)

    # the button runs the callback bound to the text area
    add_button.config(command=processAFile(txt))

    # enter the Tk event loop
    root.mainloop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment