Skip to content

Instantly share code, notes, and snippets.

@jackycute
Created December 7, 2015 03:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jackycute/d6600792d6ec40792cd3 to your computer and use it in GitHub Desktop.
Save jackycute/d6600792d6ec40792cd3 to your computer and use it in GitHub Desktop.
biglaw_splitter
#!/usr/bin/python
#coding=utf8
import os
import re
import sys
import pymysql
#===============================================
# normalize input string to prevent from errors
#===============================================
def str_normalize(str):
try:
res = ''
if str is not None:
strlist = str.split('\'')
cnt = 0
for value in strlist:
if(cnt > 0):
res += '\\\''
cnt += 1
res += value
except:
print ("E001")
sys.exit('E001')
return res
#===============================================
# Put many regular expression in proper order
#===============================================
def get_regexlist():
try:
regexlist = []
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[\s ]+事[ \s]+實.*)\n(?P<Fact1Word>(.*\n)*)(?P<Fact2>[ \s]*理[ \s]+由.*)\n(?P<Fact2Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*')
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[ \s]*事[ \s]*實及理[ \s]*由.*)\n(?P<Fact1Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*')
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[\s ]+事[ \s]+實.*)\n(?P<Fact1Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*')
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[ \s]*犯罪事實[ \s]*及[ \s]*理由.*)\n(?P<Fact1Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*')
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[ \s]*犯罪事實.*)\n(?P<Fact1Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*')
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n(?P<Premain>(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P<Main>(.*\n)*)(?P<Fact1>[ \s]*理[ \s]+由.*)\n(?P<Fact1Word>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*')
regexlist.append('.*\n(?P<Location>\S+)-\S+-(?P<Caseyear>\d+)(?P<Character>\D+)(?P<Number>\d+)\n.*\n.*\s--\s(?P<Type>\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P<Judgedate>\d+).*\n\S裁判案由\S\n(?P<Judgereason>(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P<Judge>[^ \s]+)[ \s]+.*號\n*(?P<Premain>(.*\n)*?)(?P<Tail>中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*')
except:
print ("E001")
sys.exit('E001')
return regexlist
#===============================================
# Get name of columns lied in table dynamically
#===============================================
def get_all_column_name(db, cursor):
try:
sql = "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name='refereebook'"
cursor.execute(sql)
mytuple = cursor.fetchall()
db.commit()
except:
db.rollback()
print ("E004")
sys.exit('E004')
try:
columns = []
for emt in mytuple:
columns.append(emt[0])
except:
print ("E001")
sys.exit('E001')
return columns
#===============================================
# Try to decompose the file
#===============================================
def anylize(filepath,writeDB):
# Check filepath
try:
os.path.isfile(filepath)
except:
print ("E001")
sys.exit('E001')
# Open database connectiion
try:
db = pymysql.connect(host="changethis", user="changethis", passwd="changethis", db="changethis", charset="utf8")
cursor = db.cursor()
except:
print ("E002")
sys.exit('E002')
# Open the file and translate into a string
try:
fp = open(filepath, 'r', encoding="utf8")
filecontent = ""
for line in fp:
filecontent += line
except:
print ("E001")
sys.exit('E001')
# Use regular expression
try:
columnlist = get_all_column_name(db, cursor)
regexlist = get_regexlist()
regex_ver = 0
for regex in regexlist:
results = re.search(regex, filecontent)
if(results is not None):
break;
regex_ver += 1
sqllist = []
for column in columnlist:
if(re.search(column, regex) is not None):
if(column == "Judgedate"):
year = str(int(results.group(column)[0:3]) + 1911)
month = results.group(column)[3:5]
day = results.group(column)[5:7]
sqllist.append(str_normalize(year + '-' + month + '-' + day))
else:
sqllist.append(str_normalize(results.group(column)))
else:
sqllist.append(None)
except Exception as e:
print ("E003")
sys.exit('E003')
if(not writeDB):
print (sqllist)
return 0
# Construct SQL command and insert into table
if(writeDB):
try:
sql = "INSERT INTO `refereebook`("
for column in columnlist[1:]:
sql += "`{0}`".format(column)
if(column == columnlist[-1]):
sql += ")"
break;
else:
sql += ","
sql += "VALUES ("
for emt in sqllist[1:]:
if(emt is None):
sql += 'NULL'
else:
sql += "\'{0}\'".format(emt)
if(emt == sqllist[-1]):
sql += ")"
break;
else:
sql += ","
cursor.execute(sql)
db.commit()
StatusCode = cursor.lastrowid
except Exception as e:
db.rollback()
print (e)
print ("E004")
sys.exit('E004')
return StatusCode
#===============================================
# 'E001': Constant error
# 'E002': Database connection error
# 'E003': Regular expression error
# 'E004': SQL fail
# other: ID of successfully inserted record
#===============================================
def main(argv = None):
if argv is None:
argv = sys.argv
if(len(sys.argv) == 2):
return anylize(sys.argv[1], True)
if(len(sys.argv) == 3):
return anylize(sys.argv[1], False)
print ("E001")
sys.exit('E001')
if __name__ == "__main__":
scode = main()
print(scode)
sys.exit(scode)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment