statsmaths/parse_xml_pos.py

## parse_xml_pos.py
#!/usr/bin/env python
""" Parses a directory of XML files and saves delimited parsed records

Set the DIR_IN variable to the location where you have a set of XML
records. This script will then save a pipe separated file for each
input file. There is one row per token in the input file, and three
columns of data: the raw token, the lemmatized form of the token, and
a part of speech tag. The part of speech tags are only the basic universal
ones (VERB, NOUN, DET, etc.), not the full Penn TreeBank codes. The benefit
of using spacy as a library is that it has a much greater speed when compared
to other methods.

This can be used in other generic applications where you want to apply
a basic part of speech tagger to a corpus of files. Simply do not call the
application specific replace_date function and you will be ready to go.
Note that clean_string performs the same date substitution, so it has to
be removed there as well.
"""

from __future__ import print_function

import os
import re
import spacy.en

# spaCy English pipeline, built once at import time and shared by every
# call to parse_this_file
NLP = spacy.en.English()
# Directory that get_files scans for input files
DIR_IN = "blogs"
# Directory that receives one ".csv" per input file; main() creates it
# when it does not exist yet
DIR_OUT = "blogs_out"
# When True, main() prints a progress message before parsing
VERBOSE = True

def get_files(dir_in=None, dir_out=None):
    """ get lists of input and output files

    Args:
        dir_in: directory holding the input files; defaults to DIR_IN.
        dir_out: directory the output paths point into; defaults to DIR_OUT.

    Returns:
        A (files_in, files_out) tuple of equal-length lists. Entry i of
        files_out is the output path (input file name plus ".csv") that
        belongs to entry i of files_in.
    """
    # Resolve the defaults at call time so changes to the module level
    # settings are picked up.
    if dir_in is None:
        dir_in = DIR_IN
    if dir_out is None:
        dir_out = DIR_OUT
    # List the directory exactly once: os.listdir returns names in
    # arbitrary order, so two separate listings are not guaranteed to
    # line up pairwise (or even to contain the same names).
    names = os.listdir(dir_in)
    files_in = [dir_in + "/" + name for name in names]
    files_out = [dir_out + "/" + name + ".csv" for name in names]
    return files_in, files_out


def parse_this_file(fin_name, fout_name):
    """ reads data from fin_name, parses using spacy, and saves in fout_name

    The input text is passed through replace_date and clean_string, tagged
    with the shared NLP pipeline, and written out as one line per token in
    the form ``token|lemma|pos``. Characters that cannot be encoded as
    ASCII are dropped from the output.

    Args:
        fin_name: path of the input file to read.
        fout_name: path of the pipe separated file to write.
    """
    # Context managers guarantee the handles are closed even when reading
    # or tagging raises.
    with open(fin_name, mode='r') as fin:
        text = fin.read()
    text = replace_date(text)
    text = clean_string(text)
    tokens = NLP(text, tag=True, parse=False)
    # The output file is only opened once tagging has succeeded, so a
    # failure above does not leave an empty output file behind.
    with open(fout_name, mode='w') as fout:
        for tok in tokens:
            out = tok.text + '|' + tok.lemma_ + '|' + tok.pos_ + '\n'
            # silently drop anything that is not representable in ASCII
            fout.write(out.encode('ascii', errors='ignore'))


def clean_string(text):
    """ given a string object, cleans XML and returns unicode

    Steps, in order:
      1. byte strings are decoded with the default codec, replacing
         undecodable bytes; already-decoded text is used as is
      2. every <date>...</date> element becomes the marker BPOST
      3. remaining XML tags are removed
      4. newlines, tabs, carriage returns and pipes are deleted (the pipe
         is the output column delimiter, so it must not occur in tokens)
      5. runs of spaces are collapsed to a single space

    Args:
        text: raw file content, as a byte string or as unicode text.

    Returns:
        The cleaned unicode text.
    """
    # Only byte strings need decoding; decoding text that is already
    # unicode would raise.
    if isinstance(text, bytes):
        text = text.decode(errors='replace')
    # Same substitution as replace_date; kept here so callers that use
    # clean_string on its own see unchanged behaviour.
    text = re.sub(u'<date>[^>]+</date>', u"BPOST", text)
    # A tag may contain neither '<' nor '>'. Excluding only '<' lets the
    # greedy match run up to a later stray '>' and delete the text in
    # between (e.g. "<post>5 > 3</post>" would lose "5 ").
    text = re.sub(u'<[^<>]+>', u"", text)
    # NOTE(review): line breaks are deleted rather than replaced by a
    # space, so words on adjacent lines are joined -- confirm intended.
    text = re.sub(u'[\n\t\r|]', u'', text)
    text = re.sub(u'[ ]+', u' ', text)
    return text


def replace_date(text):
    """ swaps each <date>...</date> element for the marker BPOST, which
    flags the boundary of a post """
    date_element = '<date>[^>]+</date>'
    post_marker = "BPOST"
    return re.sub(date_element, post_marker, text)


def main(start=0):
    """ calculates and parses set of files in DIR_IN

    Creates DIR_OUT when it does not exist and then runs parse_this_file
    on every (input, output) pair reported by get_files.

    Args:
        start: index of the first pair to process, counted in the order
            returned by get_files(). The default of 0 processes every
            file; pass a larger value to resume an interrupted run
            instead of editing a hard-coded offset into the loop.
    """
    files_in, files_out = get_files()
    if not os.path.exists(DIR_OUT):
        os.mkdir(DIR_OUT)

    # zip() is materialised so it can be sliced on Python 3 as well.
    pairs = list(zip(files_in, files_out))[start:]

    if VERBOSE:
        # report what will actually be processed, not the directory size
        print("Processing " + str(len(pairs)) + " input files.")

    for fin_name, fout_name in pairs:
        parse_this_file(fin_name, fout_name)


# Run the batch job only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	""" Parses a directory of XML files and saves delimited parsed records

	Set the DIR_IN variable to the location where you have a set of XML
	records. This script will then save a pipe separated file for each
	input file. There is one row per token in the input file, and three
	columns of data: the raw token, the lemmatized form of the token, and
	a part of speech tag. The part of speech tags are only the basic universal
	ones (VERB, NOUN, DET, etc.), not the full Penn TreeBank codes. The benefit
	of using spacy as a library is that it has a much greater speed when compared
	to other methods.

	This can be used in other generic applications where you want to apply
	a basic part of speech tagger to a corpus of files. Simply do not call the
	application specific replace_date function and you will be ready to go.
	Note that clean_string performs the same date substitution, so it has to
	be removed there as well.
	"""

	from __future__ import print_function

	import os
	import re
	import spacy.en

	# spaCy English pipeline, built once at import time and shared by every
	# call to parse_this_file
	NLP = spacy.en.English()
	# Directory that get_files scans for input files
	DIR_IN = "blogs"
	# Directory that receives one ".csv" per input file; main() creates it
	# when it does not exist yet
	DIR_OUT = "blogs_out"
	# When True, main() prints a progress message before parsing
	VERBOSE = True

	def get_files(dir_in=None, dir_out=None):
	    """ get lists of input and output files

	    Args:
	        dir_in: directory holding the input files; defaults to DIR_IN.
	        dir_out: directory the output paths point into; defaults to
	            DIR_OUT.

	    Returns:
	        A (files_in, files_out) tuple of equal-length lists. Entry i of
	        files_out is the output path (input file name plus ".csv") that
	        belongs to entry i of files_in.
	    """
	    # Resolve the defaults at call time so changes to the module level
	    # settings are picked up.
	    if dir_in is None:
	        dir_in = DIR_IN
	    if dir_out is None:
	        dir_out = DIR_OUT
	    # List the directory exactly once: os.listdir returns names in
	    # arbitrary order, so two separate listings are not guaranteed to
	    # line up pairwise (or even to contain the same names).
	    names = os.listdir(dir_in)
	    files_in = [dir_in + "/" + name for name in names]
	    files_out = [dir_out + "/" + name + ".csv" for name in names]
	    return files_in, files_out


	def parse_this_file(fin_name, fout_name):
	    """ reads data from fin_name, parses using spacy, and saves in fout_name

	    The input text is passed through replace_date and clean_string,
	    tagged with the shared NLP pipeline, and written out as one line per
	    token in the form ``token|lemma|pos``. Characters that cannot be
	    encoded as ASCII are dropped from the output.

	    Args:
	        fin_name: path of the input file to read.
	        fout_name: path of the pipe separated file to write.
	    """
	    # Context managers guarantee the handles are closed even when
	    # reading or tagging raises.
	    with open(fin_name, mode='r') as fin:
	        text = fin.read()
	    text = replace_date(text)
	    text = clean_string(text)
	    tokens = NLP(text, tag=True, parse=False)
	    # The output file is only opened once tagging has succeeded, so a
	    # failure above does not leave an empty output file behind.
	    with open(fout_name, mode='w') as fout:
	        for tok in tokens:
	            # plain '|' delimiter: '\|' would write a literal backslash
	            # in front of every pipe
	            out = tok.text + '|' + tok.lemma_ + '|' + tok.pos_ + '\n'
	            # silently drop anything that is not representable in ASCII
	            fout.write(out.encode('ascii', errors='ignore'))


	def clean_string(text):
	    """ given a string object, cleans XML and returns unicode

	    Steps, in order:
	      1. byte strings are decoded with the default codec, replacing
	         undecodable bytes; already-decoded text is used as is
	      2. every <date>...</date> element becomes the marker BPOST
	      3. remaining XML tags are removed
	      4. newlines, tabs, carriage returns and pipes are deleted (the
	         pipe is the output column delimiter, so it must not occur in
	         tokens)
	      5. runs of spaces are collapsed to a single space

	    Args:
	        text: raw file content, as a byte string or as unicode text.

	    Returns:
	        The cleaned unicode text.
	    """
	    # Only byte strings need decoding; decoding text that is already
	    # unicode would raise.
	    if isinstance(text, bytes):
	        text = text.decode(errors='replace')
	    # Same substitution as replace_date; kept here so callers that use
	    # clean_string on its own see unchanged behaviour.
	    text = re.sub(u'<date>[^>]+</date>', u"BPOST", text)
	    # A tag may contain neither '<' nor '>'. Excluding only '<' lets the
	    # greedy match run up to a later stray '>' and delete the text in
	    # between (e.g. "<post>5 > 3</post>" would lose "5 ").
	    text = re.sub(u'<[^<>]+>', u"", text)
	    # Bare pipes must go, not only backslash-pipe sequences, otherwise
	    # they corrupt the pipe separated output.
	    # NOTE(review): line breaks are deleted rather than replaced by a
	    # space, so words on adjacent lines are joined -- confirm intended.
	    text = re.sub(u'[\n\t\r|]', u'', text)
	    text = re.sub(u'[ ]+', u' ', text)
	    return text


	def replace_date(text):
	    """ replaces each <date>...</date> XML element with the marker BPOST,
	    which flags the boundary of a post

	    Args:
	        text: the raw XML text.

	    Returns:
	        The text with every date element substituted.
	    """
	    return re.sub('<date>[^>]+</date>', "BPOST", text)


	def main(start=0):
	    """ calculates and parses set of files in DIR_IN

	    Creates DIR_OUT when it does not exist and then runs parse_this_file
	    on every (input, output) pair reported by get_files.

	    Args:
	        start: index of the first pair to process, counted in the order
	            returned by get_files(). The default of 0 processes every
	            file; pass a larger value to resume an interrupted run
	            instead of editing a hard-coded offset into the loop.
	    """
	    files_in, files_out = get_files()
	    if not os.path.exists(DIR_OUT):
	        os.mkdir(DIR_OUT)

	    # zip() is materialised so it can be sliced on Python 3 as well.
	    pairs = list(zip(files_in, files_out))[start:]

	    if VERBOSE:
	        # report what will actually be processed, not the directory size
	        print("Processing " + str(len(pairs)) + " input files.")

	    for fin_name, fout_name in pairs:
	        parse_this_file(fin_name, fout_name)


	# Run the batch job only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()