Skip to content

Instantly share code, notes, and snippets.

@statsmaths
Created November 24, 2015 18:27
Show Gist options
  • Save statsmaths/70c40dfd75ab48a019d2 to your computer and use it in GitHub Desktop.
Save statsmaths/70c40dfd75ab48a019d2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
""" Parses a directory of XML files and saves delimited parsed records
Set the DIR_IN variable to the location where you have a set of XML
records. This script will then save a pipe separated file for each
input file. There is one row per token in the input file, and three
columns of data: the raw token, the lemmatized form of the token, and
a part of speech tag. The part of speech tags are only the basic universal
ones (VERB, NOUN, DET, etc.) not the full Penn TreeBank codes. The benefit
of using spacy as a library is that it has a much greater speed when compared
to other methods.
This can be used in other generic applications where you want to apply
a basic part of speech tagger to a corpus of files. Simply do not call the
application specific replace_date function and you will be ready to go.
"""
from __future__ import print_function

import io
import os
import re

import spacy.en
# Shared spaCy English language object, built once when the module is loaded
# and reused for every file that is parsed.
NLP = spacy.en.English()
# Directory whose entries are read as input files.
DIR_IN = "blogs"
# Directory that receives one ".csv" output file per input file.
DIR_OUT = "blogs_out"
# When true, main() prints a count of files before processing starts.
VERBOSE = True
def get_files():
    """ get parallel lists of input and output file paths

    DIR_IN is listed exactly once, so both lists are built from the same
    snapshot of the directory and stay aligned index by index. The entries
    are sorted because os.listdir makes no ordering guarantee; sorting keeps
    the order reproducible between runs.

    Returns a tuple (files_in, files_out) where files_in[i] is an entry of
    DIR_IN joined onto DIR_IN, and files_out[i] is the same entry name with
    ".csv" appended, joined onto DIR_OUT.

    Raises OSError if DIR_IN cannot be listed.
    """
    names = sorted(os.listdir(DIR_IN))
    files_in = [os.path.join(DIR_IN, name) for name in names]
    files_out = [os.path.join(DIR_OUT, name + ".csv") for name in names]
    return files_in, files_out
def parse_this_file(fin_name, fout_name):
    """ reads data from fin_name, parses it using spacy, and saves in fout_name

    One line is written per token, in the form "text|lemma|pos". Characters
    that cannot be encoded as ASCII are dropped from the output.

    fin_name -- path of the input file to read
    fout_name -- path of the output file; created, or overwritten if present

    Raises IOError/OSError if either file cannot be opened.
    """
    # The context manager closes the input even if reading raises.
    with open(fin_name, mode='r') as fin:
        text = fin.read()
    text = replace_date(text)
    text = clean_string(text)
    tokens = NLP(text, tag=True, parse=False)
    # The output is opened only after parsing has succeeded, so a failure
    # above does not leave behind a truncated, empty output file.
    # io.open gives a text-mode file on both Python 2 and 3 that performs the
    # ASCII encoding itself; errors='ignore' drops non-ASCII characters, which
    # matches encoding each line with errors='ignore' before writing.
    with io.open(fout_name, mode='w', encoding='ascii',
                 errors='ignore') as fout:
        for tok in tokens:
            # Unicode literals keep the line unicode on Python 2, which the
            # text-mode file requires.
            fout.write(u'|'.join((tok.text, tok.lemma_, tok.pos_)) + u'\n')
def clean_string(text):
    """ given a byte or unicode string, cleans XML and returns unicode

    Steps, in order:
      1. byte strings are decoded as ASCII; undecodable bytes become U+FFFD
      2. each <date>...</date> element is replaced by the marker BPOST
         (the same substitution replace_date performs)
      3. every remaining <...> tag is removed
      4. newlines, tabs, carriage returns and pipe characters are removed
         (the pipe is the output column delimiter)
      5. runs of spaces are collapsed to a single space

    text -- byte string or unicode string to clean
    Returns the cleaned unicode string.
    """
    # Only bytes need decoding. On Python 2 `bytes` is an alias of `str`, so
    # this covers the original input type, while text that is already unicode
    # passes straight through instead of raising.
    if isinstance(text, bytes):
        text = text.decode('ascii', 'replace')
    text = re.sub(u'<date>[^>]+</date>', u'BPOST', text)
    text = re.sub(u'<[^<]+>', u'', text)
    # One pass removes all four unwanted single characters.
    text = re.sub(u'[\n\t\r|]', u'', text)
    text = re.sub(u'[ ]+', u' ', text)
    return text
def replace_date(text):
    """ swaps every <date>...</date> element for BPOST, the post boundary marker """
    date_element = '<date>[^>]+</date>'
    boundary_marker = "BPOST"
    return re.compile(date_element).sub(boundary_marker, text)
def main(start=0):
    """ calculates and parses set of files in DIR_IN

    start -- index of the first (input, output) pair to process. The default
             of 0 handles every file. A larger value skips the leading pairs,
             e.g. to resume an interrupted run; that is only meaningful when
             get_files() returns the files in a stable order.

    Creates DIR_OUT if it does not exist yet.
    """
    files_in, files_out = get_files()
    if not os.path.exists(DIR_OUT):
        os.makedirs(DIR_OUT)
    # zip() is materialised before slicing because it returns a
    # non-subscriptable iterator on Python 3.
    pairs = list(zip(files_in, files_out))[start:]
    if VERBOSE:
        # Report the number of files that will actually be processed.
        print("Processing " + str(len(pairs)) + " input files.")
    for fin_name, fout_name in pairs:
        parse_this_file(fin_name, fout_name)
# Run the batch job only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment