Created
December 7, 2015 03:20
-
-
Save jackycute/d6600792d6ec40792cd3 to your computer and use it in GitHub Desktop.
biglaw_splitter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
#coding=utf8 | |
import os | |
import re | |
import sys | |
import pymysql | |
#=============================================== | |
# normalize input string to prevent from errors | |
#=============================================== | |
def str_normalize(str): | |
try: | |
res = '' | |
if str is not None: | |
strlist = str.split('\'') | |
cnt = 0 | |
for value in strlist: | |
if(cnt > 0): | |
res += '\\\'' | |
cnt += 1 | |
res += value | |
except: | |
print ("E001") | |
sys.exit('E001') | |
return res | |
#=============================================== | |
# Put many regular expression in proper order | |
#=============================================== | |
def get_regexlist(): | |
try: | |
regexlist = [] | |
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[\s ]+事[ \s]+實.*)\n(?P<Fact1Word>(.*\n)*)(?P<Fact2>[ \s]*理[ \s]+由.*)\n(?P<Fact2Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') | |
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[ \s]*事[ \s]*實及理[ \s]*由.*)\n(?P<Fact1Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') | |
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[\s ]+事[ \s]+實.*)\n(?P<Fact1Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') | |
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[ \s]*犯罪事實[ \s]*及[ \s]*理由.*)\n(?P<Fact1Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') | |
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[ \s]*犯罪事實.*)\n(?P<Fact1Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') | |
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[ \s]*理[ \s]+由.*)\n(?P<Fact1Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') | |
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n*(?P<Premain>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') | |
except: | |
print ("E001") | |
sys.exit('E001') | |
return regexlist | |
#=============================================== | |
# Get name of columns lied in table dynamically | |
#=============================================== | |
def get_all_column_name(db, cursor): | |
try: | |
sql = "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name='refereebook'" | |
cursor.execute(sql) | |
mytuple = cursor.fetchall() | |
db.commit() | |
except: | |
db.rollback() | |
print ("E004") | |
sys.exit('E004') | |
try: | |
columns = [] | |
for emt in mytuple: | |
columns.append(emt[0]) | |
except: | |
print ("E001") | |
sys.exit('E001') | |
return columns | |
#=============================================== | |
# Try to decompose the file | |
#=============================================== | |
def anylize(filepath,writeDB): | |
# Check filepath | |
try: | |
os.path.isfile(filepath) | |
except: | |
print ("E001") | |
sys.exit('E001') | |
# Open database connectiion | |
try: | |
db = pymysql.connect(host="changethis", user="changethis", passwd="changethis", db="changethis", charset="utf8") | |
cursor = db.cursor() | |
except: | |
print ("E002") | |
sys.exit('E002') | |
# Open the file and translate into a string | |
try: | |
fp = open(filepath, 'r', encoding="utf8") | |
filecontent = "" | |
for line in fp: | |
filecontent += line | |
except: | |
print ("E001") | |
sys.exit('E001') | |
# Use regular expression | |
try: | |
columnlist = get_all_column_name(db, cursor) | |
regexlist = get_regexlist() | |
regex_ver = 0 | |
for regex in regexlist: | |
results = re.search(regex, filecontent) | |
if(results is not None): | |
break; | |
regex_ver += 1 | |
sqllist = [] | |
for column in columnlist: | |
if(re.search(column, regex) is not None): | |
if(column == "Judgedate"): | |
year = str(int(results.group(column)[0:3]) + 1911) | |
month = results.group(column)[3:5] | |
day = results.group(column)[5:7] | |
sqllist.append(str_normalize(year + '-' + month + '-' + day)) | |
else: | |
sqllist.append(str_normalize(results.group(column))) | |
else: | |
sqllist.append(None) | |
except Exception as e: | |
print ("E003") | |
sys.exit('E003') | |
if(not writeDB): | |
print (sqllist) | |
return 0 | |
# Construct SQL command and insert into table | |
if(writeDB): | |
try: | |
sql = "INSERT INTO `refereebook`(" | |
for column in columnlist[1:]: | |
sql += "`{0}`".format(column) | |
if(column == columnlist[-1]): | |
sql += ")" | |
break; | |
else: | |
sql += "," | |
sql += "VALUES (" | |
for emt in sqllist[1:]: | |
if(emt is None): | |
sql += 'NULL' | |
else: | |
sql += "\'{0}\'".format(emt) | |
if(emt == sqllist[-1]): | |
sql += ")" | |
break; | |
else: | |
sql += "," | |
cursor.execute(sql) | |
db.commit() | |
StatusCode = cursor.lastrowid | |
except Exception as e: | |
db.rollback() | |
print (e) | |
print ("E004") | |
sys.exit('E004') | |
return StatusCode | |
#=============================================== | |
# 'E001': Constant error | |
# 'E002': Database connection error | |
# 'E003': Regular expression error | |
# 'E004': SQL fail | |
# other: ID of successfully inserted record | |
#=============================================== | |
def main(argv = None): | |
if argv is None: | |
argv = sys.argv | |
if(len(sys.argv) == 2): | |
return anylize(sys.argv[1], True) | |
if(len(sys.argv) == 3): | |
return anylize(sys.argv[1], False) | |
print ("E001") | |
sys.exit('E001') | |
if __name__ == "__main__": | |
scode = main() | |
print(scode) | |
sys.exit(scode) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment