Skip to content

Instantly share code, notes, and snippets.

@thisismattmiller
Created February 22, 2012 06:27
Show Gist options
  • Save thisismattmiller/1882244 to your computer and use it in GitHub Desktop.
Save thisismattmiller/1882244 to your computer and use it in GitHub Desktop.
Script to parse visualMOA.org SGML files using NLTK
#!/usr/bin/env python
import optparse
import sys
from lxml import etree
import time
import nltk
from pymongo import Connection
import os
import hashlib
# Module-level shared state.  Everything starts out unset; main() assigns the
# real objects before any file is processed.

# NLTK helpers: sentence splitter, word tokenizer and part-of-speech tagger.
tokenizer_sentences = tokenizer_words = tagger = None

# MongoDB handles: the connection, the database and the target collection.
connection = db = posts = None
def main():
    """Load the NLTK models, connect to MongoDB and process every file in --dir.

    Command line:
        --dir / -d   directory containing the SGML files to import.

    Side effects:
        Populates the module-level globals (tokenizers, tagger, MongoDB
        connection / database / collection) that process_file() relies on,
        then calls process_file() once for each regular file in the directory.

    Raises:
        SystemExit: when no --dir option was supplied.
    """
    global tokenizer_sentences
    global tokenizer_words
    global tagger
    global db
    global connection
    global posts

    p = optparse.OptionParser()
    p.add_option('--dir', '-d', default="")
    options, arguments = p.parse_args()
    if options.dir == "":
        print('No file given')
        sys.exit()

    # Sentence splitter, word/punctuation tokenizer and a unigram POS tagger
    # trained on the Brown corpus.
    tokenizer_sentences = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_words = nltk.tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')
    tagger = nltk.UnigramTagger(nltk.corpus.brown.tagged_sents())

    # Open a single connection; the original opened a default connection first
    # and immediately replaced it, leaving the first one dangling.
    # NOTE(review): newer PyMongo releases replace Connection with
    # MongoClient -- confirm which version this script is pinned to.
    connection = Connection('localhost', 27017)
    db = connection.moa
    posts = db.journals

    for fname in os.listdir(options.dir):
        # os.path.join works whether or not --dir ends with a path separator
        # (plain string concatenation did not).
        path = os.path.join(options.dir, fname)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as files and would otherwise
            # abort the whole run inside process_file().
            continue
        process_file(path)
def process_file(file_name):
# Parse one SGML file and insert the sentences found in it into MongoDB.
#
# Parameters:
#   file_name -- path of the file to open and parse.
#
# What the visible code does:
#   * reads the whole file and parses it with lxml's etree.fromstring();
#   * for every TEI.2 element, tries to turn the text of its DATE elements
#     into a time.struct_time (several formats, then an interactive prompt);
#   * for every DIV1 element, reads TITLE / TYPE / DECLS metadata, then for
#     every PB element cleans its text, splits it into sentences, tags the
#     words and collects nouns, verbs and proper nouns;
#   * builds a `post` dict and inserts it into the `posts` collection.
#
# Exits the process with sys.exit() when the file cannot be opened, when a
# date cannot be parsed even after prompting, or when no date was found.
# There is no return statement in the visible code.
#
# NOTE(review): this copy of the file has lost all indentation and three
# lines (marked below) are cut off with a trailing '$'.  The nesting described
# in these comments is inferred from statement order -- confirm against an
# intact copy before changing any logic.
global tokenizer_sentences
global tokenizer_words
global tagger
global db
global connection
global posts
# Of the globals above, only tokenizer_sentences, tokenizer_words, tagger and
# posts are referenced in the visible body.
print "\n\n" + file_name + ":"
try:
data_string = open(file_name, 'r').read()
except IOError:
print 'Could not find that file'
sys.exit()
data_tree = etree.fromstring(data_string)
# found_count is incremented once per PB element below; it is not read
# anywhere in the visible code.
found_count = 0;
for tei in data_tree.findall('.//TEI.2'):
# '' is the "no date parsed yet" sentinel checked after the DATE loop.
serial_date = ''
for date_serial in tei.findall('.//DATE'):
# A DATE whose text is a plain integer does not raise ValueError here and so
# skips the journal-date parsing below (the original author wondered whether
# such a value is the scan date).
try:
int(date_serial.text)
except ValueError:
#it is the journal date
print "\t" + date_serial.text
# fix some small mistakes made by the people who entered the data:
# commas and full stops are replaced by spaces
date_serial.text = date_serial.text.replace(', ',' ')
date_serial.text = date_serial.text.replace(',',' ')
date_serial.text = date_serial.text.replace('. ',' ')
date_serial.text = date_serial.text.replace('.',' ')
# When the text contains a '-', keep everything up to one character before
# the '-' plus the last four characters (presumably the first month of a
# range followed by the year -- confirm against the data).
if (date_serial.text.find('-') != -1):
date_serial.text = date_serial.text[0:date_serial.text.find('-')-1] + ' ' + date_serial.text[len(date_serial.text)-4:]
date_serial.text = date_serial.text.replace('Octoberl','October')
# Try the date formats in turn: "%B %Y", "%b %Y" (after rewriting 'Sept ' to
# 'Sep '), "%b %d %Y", "%B %d %Y", and finally ask the user.
try:
serial_date = time.strptime(date_serial.text, "%B %Y")
except ValueError:
try:
date_serial.text = date_serial.text.replace('Sept ','Sep ')
serial_date = time.strptime(date_serial.text, "%b %Y")
except ValueError:
#it might be in a month day year format
try:
serial_date = time.strptime(date_serial.text, "%b %d %Y")
except ValueError:
# last automatic attempt: full month name with a day
try:
serial_date = time.strptime(date_serial.text, "%B %d %Y")
except ValueError:
# give up and ask the user for the date; this blocks on stdin
try:
print date_serial.text + ' could not be phrased as a date!'
date_serial.text = raw_input("Enter date in format (Mmm yyyy): ")
serial_date = time.strptime(date_serial.text, "%b %Y")
except ValueError:
print date_serial.text + ' could not be phrased as a date!'
sys.exit()
if serial_date == '':
# No date was parsed.  A DATE text of '1999' is tolerated: per the original
# author this may be the "notes of digital production" section, which has no
# PB elements.
# NOTE(review): date_serial is the loop variable of the DATE loop above; if a
# TEI.2 element has no DATE children it would presumably be unbound (or stale)
# here -- confirm.
if date_serial.text != '1999':
print "Error: Could not locate a data for this section"
sys.exit()
for div1 in tei.findall('.//DIV1'):
article_title = ''
article_type = ''
for title in div1.findall('.//TITLE'):
# NOTE(review): subscripting `attrib` with a missing key presumably raises
# KeyError instead of returning None, so the `is None` branches here and for
# TYPE / DECLS below may never run -- confirm against the lxml documentation.
if title.attrib['TYPE'] is None:
article_type = 'Unkown'
else:
article_type = title.attrib['TYPE']
if title.text is None:
article_title = 'Unkown Title'
else:
article_title = title.text
# The trailing comma keeps the cursor on the same line (Python 2 print) so
# the progress dots printed per PB element follow the title.
print "\t" + article_title + ' :: ' + article_type,
if div1.attrib['TYPE'] is None:
article_meta_type = 'Unkown Type'
else:
article_meta_type = div1.attrib['TYPE']
if div1.attrib['DECLS'] is None:
article_meta_decls = 'Unkown Decls'
else:
article_meta_decls = div1.attrib['DECLS']
# text_article is assigned but not used in the visible code.
text_article = ''
text_segment = ''
for text in div1.findall('.//PB'):
print ".",
found_count=found_count+1
article_meta_page = text.attrib['REF']
# article_meta_seq is read here but not stored in the `post` dict below.
article_meta_seq = text.attrib['SEQ']
# str(None) is the string 'None', so a PB element without text yields the
# literal text 'None' here.
text_segment = str(text.text)
# Remove '~' or '-' followed by a newline, turn the remaining newlines into
# spaces, drop tabs and trim.
text_segment = text_segment.replace('~\n','')
text_segment = text_segment.replace('-\n','')
text_segment = text_segment.replace('\n',' ')
text_segment = text_segment.replace('\t','')
text_segment = text_segment.strip()
text_sentences = (tokenizer_sentences.tokenize(text_segment))
for sentence in text_sentences:
sentence=sentence.replace('\n','')
sentence=sentence.replace('\t','')
sentence=sentence.replace('\r','')
# the source text is very noisy; only sentences longer than 30 characters
# are tokenized and tagged
if len(sentence) > 30:
#print sentence + "\n--"
tokenized = tokenizer_words.tokenize(sentence)
tagged = tagger.tag(tokenized)
#print tagged
# Each `item` of `tagged` is indexed as item[0] = word, item[1] = tag.
nouns=[]
verbs=[]
propers = []
for index, item in enumerate(tagged):
#print index, item
#print item[0]
noun_proper = ''
#first find the proper nouns
# NOTE(review): the next line is truncated in this copy (it ends in '$');
# the rest of the tag condition is not visible.
if (item[1] == 'NP') or (item[1] == 'NP-TL') or (item[1] == 'NPS-TL') or (item[1] == 'NNP') or (item[1] == 'NN$
# now see if another title-cased word follows, meaning a possible name
noun_proper = item[0]
# only look ahead if there is a next token
if index+1 <= len(tagged) - 1:
if (tagged[index+1][0].istitle() == True):
#print "Two word proper noun!" + tagged[index][0] + ' ' + tagged[index+1][0]
noun_proper = tagged[index][0] + ' ' + tagged[index+1][0]
if index+2 <= len(tagged) - 1:
if tagged[index+2][0].istitle() == True:
#print "Three word proper noun!"
# NOTE(review): the next line is truncated in this copy (it ends in '$').
noun_proper = tagged[index][0] + ' ' + tagged[index+1][0] + ' ' + tagge$
#print noun_proper
# sometimes part of the proper noun is not tagged as a noun, such as the
# "new" in "new york": if no following word was attached, try the
# title-cased word before this one
if noun_proper.find(' ') == -1:
if index-1 >= 0:
if (tagged[index-1][0].istitle() == True):
noun_proper = tagged[index-1][0] + ' ' + tagged[index][0]
# The duplicate check compares the original-case string against the
# lower-cased entries already stored in `propers`.
if (noun_proper!='') and noun_proper not in propers:
if (len(noun_proper))>2:
propers.append(noun_proper.lower())
# now find the rest of the nouns
# NOTE(review): the next line is truncated in this copy (it ends in '$').
if (item[1] == 'NN') or (item[1] == 'NNS') or (item[1] == 'NN-TL') or (item[1] == 'NNS-TL') or (item[1] == None$
#make sure it is not in the proper noun if we found it above
if noun_proper.find(item[0]) == -1:
# see if it has an important (title-cased) word before it that is not the
# first word of the sentence -- hence `> 0` rather than `>= 0`
if index-1 > 0:
if tagged[index-1][0].istitle() == True:
nouns.append(tagged[index-1][0].lower() + ' ' + item[0].lower())
else:
nouns.append(item[0].lower())
#verbs
# any tag starting with 'VB' counts as a verb
if (item[1] != None) and (item[1][0:2] == 'VB'):
verbs.append(item[0].lower())
# NOTE(review): "date" is built as float("<year>.<month>"), so month 10
# gives e.g. 1850.10 == 1850.1, the same value as month 1 -- confirm this is
# acceptable for sorting/filtering.
# "article_id" is the MD5 hex digest of the DECLS attribute concatenated
# with the article title.
post = {"date": float(str(serial_date.tm_year) + '.' + str(serial_date.tm_mon)),
"year": int(serial_date.tm_year),
"article_title": article_title,
"article_type" : article_type,
"text_meta_type" : article_meta_type,
"text_meta_decls" : article_meta_decls,
"text_meta_page" : article_meta_page,
"article_id" : hashlib.md5(article_meta_decls + article_title).hexdigest(),
"sentence" : sentence,
"nouns" : nouns,
"verbs" : verbs,
"propers" : propers}
# Presumably one document is inserted per processed sentence -- nesting
# inferred, confirm.
posts.insert(post)
print "~"
# Script entry point: run the importer only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment