Skip to content

Instantly share code, notes, and snippets.

@johnsmith17th
Created May 27, 2013 05:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johnsmith17th/5655342 to your computer and use it in GitHub Desktop.
Save johnsmith17th/5655342 to your computer and use it in GitHub Desktop.
Extract data from fml text
# -*- coding: utf-8 -*-
import codecs
# INSERT statement template for one row of the fml table. The id placeholder is
# emitted unquoted; the five text columns are wrapped in single quotes.
sql = (
    "insert into fml(id, author, gender, address, content, date) "
    "values(%s, '%s', '%s', '%s', '%s', '%s');\n"
)
def getPrefix(tx):
    """Return the text between the last '-' and the last '.' of ``tx``.

    For a path such as ``data/2013-05-27-1.txt`` this yields ``'1'``.
    When there is no '-' the text starts at the beginning of ``tx``.
    When there is no '.' after the last '-' (no extension, or the only dot
    sits in a directory part) the text runs to the end of ``tx``.
    """
    start = tx.rfind('-') + 1  # rfind() == -1 (no dash) conveniently gives 0
    end = tx.rfind('.')
    # A dot that is missing or located before the dash is not an extension
    # separator; slicing with it would truncate or empty the prefix.
    if end < start:
        end = len(tx)
    return tx[start:end]
def getDate(tx):
    """Return the date part of a path's file name with '-' turned into '/'.

    The date is everything in the file name (the text after the last '/')
    that precedes its last '-', e.g. ``data/2013-05-27-1.txt`` gives
    ``'2013/05/27'``. An empty string is returned when the file name
    contains no '-'.
    """
    name = tx[tx.rfind('/') + 1:]  # rfind() == -1 (no slash) keeps all of tx
    dash = name.rfind('-')
    if dash == -1:
        # Slicing with -1 would silently drop the last character instead of
        # signalling that there is no date.
        return ''
    return name[:dash].replace('-', '/')
def getAuthor(tx):
    """Return the author name from a record header line.

    The header is split at the first ``' - '``; only the part before it is
    considered (the whole line when the separator is missing). Within that
    part, anything from the first opening parenthesis onwards is dropped
    and the result is stripped of surrounding whitespace.
    """
    sep = tx.find(' - ')
    # Without the guard, find() == -1 would chop the last character.
    head = tx if sep == -1 else tx[:sep]
    # NOTE(review): the original contained two byte-identical checks for
    # u'('. Presumably one of them was the full-width parenthesis U+FF08
    # used in Chinese text; both forms are accepted here -- confirm.
    for paren in (u'(', u'\uff08'):
        cut = head.find(paren)
        if cut != -1:
            return head[:cut].strip()
    return head.strip()
def getGender(tx):
    """Return a gender code derived from a record header line.

    Only the part of ``tx`` before the first ``' - '`` is inspected (the
    whole line when the separator is missing):

    * ``u'un'`` if it contains the character 匿,
    * ``u'fm'`` if it contains the character 女,
    * ``u'ma'`` otherwise.

    NOTE(review): the codes presumably stand for unknown/anonymous, female
    and male -- confirm against the consumer of the fml table.
    """
    sep = tx.find(' - ')
    # Without the guard, find() == -1 would chop the last character and a
    # trailing marker would be missed.
    head = tx if sep == -1 else tx[:sep]
    if u'匿' in head:
        return u'un'
    if u'女' in head:
        return u'fm'
    return u'ma'
def getAddr(tx):
    """Return the address part of a record header line.

    The address is everything after the first ``'- '`` in ``tx``. An empty
    string is returned when the separator is missing.
    """
    pos = tx.find('- ')
    if pos == -1:
        # find() + 2 would otherwise evaluate to 1 and return tx[1:], i.e.
        # the header minus its first character, as a bogus address.
        return u''
    return tx[pos + 2:]
def extra(src):
    """Convert one fml text dump into a file of SQL INSERT statements.

    ``src`` is read as UTF-8 and split into records on lines consisting of
    ``###``. In each record the first line is the header (parsed with
    getAuthor / getGender / getAddr) and the second line is the content;
    any further lines are ignored. The date and the id prefix are taken
    from the ``src`` path via getDate / getPrefix.

    One statement per record is written, UTF-8 encoded, to
    ``sql/<prefix>.txt``; the ``sql`` directory must already exist. Record
    ids are the prefix followed by a counter that starts at 99 and counts
    down. Every field is also echoed to stdout.
    """
    import io  # local import: the file-level imports only provide codecs

    def quote(value):
        # Double embedded single quotes so a value cannot terminate the
        # SQL string literal it is placed in.
        return value.replace(u"'", u"''")

    # Text mode with universal newlines turns CRLF into LF, so LF and CRLF
    # inputs are split identically below. io.open already returns unicode
    # text, so no extra conversion is needed.
    with io.open(src, 'r', encoding='utf-8') as fin:
        txt = fin.read()

    pre = getPrefix(src)
    da = getDate(src)  # identical for every record, so computed once
    i = 99
    with io.open('sql/%s.txt' % pre, 'w', encoding='utf-8') as of:
        for x in txt.split(u'###\n'):
            l = x.split(u'\n')
            # A blank chunk (e.g. after a trailing separator) or a chunk
            # without a content line is not a record; indexing l[1] on it
            # would raise IndexError.
            if len(l) < 2 or not x.strip():
                continue
            ui = pre + str(i)
            au = getAuthor(l[0])
            ge = getGender(l[0])
            ad = getAddr(l[0])
            co = l[1].strip()
            print('id: ' + ui)
            print('author: ' + au)
            print('gender: ' + ge)
            print('addr: ' + ad)
            print('content: ' + co)
            print('date: ' + da)
            print('')
            i -= 1
            of.write(sql % (ui, quote(au), quote(ge), quote(ad),
                            quote(co), quote(da)))
# Driver: path.txt lists one data file name per line; each one is converted
# by extra() relative to the data/ directory.
with open('path.txt') as fh:
    ss = fh.read().split('\n')
for s in ss:
    s = s.strip()
    # The trailing newline of path.txt yields an empty entry; skipping it
    # avoids calling extra() on the bare 'data/' directory.
    if not s:
        continue
    extra('data/' + s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment