Script for splitting LexisNexis files. Adapted from an original script by Neal Caren.
#!/usr/bin/env python
# encoding: utf-8

"""
split_ln.py

Created by Neal Caren on 2012-05-14.
neal.caren@unc.edu

Edited by Alex Hanna on 2015-01-29.
alex.hanna@gmail.com

Takes a downloaded plain-text LexisNexis file and converts it into a CSV
file or a set of flat files.
"""
import argparse, csv, os, re, sys
from datetime import datetime
parser = argparse.ArgumentParser(description='Parse Lexis-Nexis files into different outputs.')
parser.add_argument('files', metavar='file', type=str, nargs='+', help='Lexis-Nexis files to be parsed.')
parser.add_argument('--output_dir', dest='output', action='store', help='Directory in which to store the output.')
parser.add_argument('--sep', dest='sep', const='sep', default='csv', action='store_const',
                    help='Flag to store output in separate files.')
args = parser.parse_args()
if args.output:
    if not os.path.isdir(args.output):
        print("Not a valid directory.")
        sys.exit(-1)
else:
    args.output = "."
## set permanent columns
header = ['SEARCH_ID', 'PUBLICATION', 'DATE', 'TITLE', 'EDITION']
if args.sep == 'csv':
    ## use today's date to name the output file
    today_str = datetime.today().strftime('%Y-%m-%d')
    outname = "%s/lexis-nexis_%s.csv" % (args.output, today_str)

    # set up the output file; newline='' keeps the csv module from
    # writing extra blank rows on Windows
    outfile = open(outname, 'w', newline='')
    writer = csv.writer(outfile)

## the CSV header is written once, from the first file's metadata
header_written = False
for fn in args.files:
    print('Processing %s' % fn)

    # read the file; utf-8-sig strips any leading byte-order mark, and
    # newline='' preserves the \r\n line endings the script splits on
    lnraw = open(fn, encoding='utf-8-sig', newline='').read()
    # silly hack to find the end of each document
    workfile = re.sub(r' Copyright .*?\r\n', 'ENDOFILE', lnraw)
    # split the file into a list of documents
    workfile = workfile.split('ENDOFILE')

    # remove blank rows
    workfile = [f for f in workfile if len(f.split('\r\n\r\n')) > 2]
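
    ## Each document in a LexisNexis plain-text download typically looks
    ## like the following (hypothetical example; blocks separated by
    ## blank lines), which is why fields are pulled out by position below:
    ##
    ##     3 of 120 DOCUMENTS
    ##     The Example Times
    ##     February 19, 2015 Thursday, Late Edition
    ##     Headline Of The Article
    ##     BYLINE: Jane Doe
    ##     Body paragraphs ...
    ##     LOAD-DATE: February 20, 2015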
    # figure out what metadata is being reported
    meta_list = list(set(re.findall(r'\n([A-Z][A-Z-]*?):', lnraw)))

    # keep only the commonly occurring metadata
    meta_list = [m for m in meta_list if float(lnraw.count(m)) / len(workfile) > .20]
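
    ## meta_list now holds field tags such as BYLINE, SECTION, LENGTH, or
    ## LOAD-DATE -- any tag averaging more than 0.2 occurrences per document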
    if args.sep == 'csv' and not header_written:
        ## extend the header with this file's metadata tags and write it once
        ## TK: not sure how to deal with the case where metadata changes
        ## between different input files
        header.extend(meta_list)
        header.append('TEXT')
        writer.writerow(header)
        header_written = True
    ## begin loop over each article
    for f in workfile:
        # split into paragraphs and clean up the hard returns at the end of
        # each line; also remove blank rows and the occasional copyright line
        filessplit = [row.replace('\r\n', ' ').strip() for row in f.split('\r\n\r\n')
                      if len(row) > 0 and 'All Rights Reserved' not in row]
        ## make metadata dict
        meta_dict = {k: '' for k in header}

        doc_id = filessplit[0].strip().split(' ')[0]
        pub = filessplit[1].strip()
        date_ed = filessplit[2].strip()
        title = filessplit[3].strip()
        ## format date into YYYY-MM-DD
        da = date_ed.replace(',', '').split()
        date = datetime.strptime(" ".join(da[0:3]), "%B %d %Y")
        date = date.strftime("%Y-%m-%d")
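
        ## e.g. a date line of "February 19, 2015 Thursday" would become
        ## "2015-02-19" (hypothetical example)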
        ## take the edition as the last token of the date line
        ## TK: maybe remove?
        ed = date_ed.split(' ')[-1].strip()

        ## if the edition is a time or a day of the week, skip it
        if 'GMT' in ed or 'day' in ed:
            ed = ''
        ## edit the text and other information
        paragraphs = []
        for line in filessplit[5:]:
            ## treat a line as body text if it is non-empty, not indented,
            ## not all-caps, not a metadata field, and not the title
            if (len(line) > 0 and not line.startswith('  ')
                    and line != line.upper()
                    and len(re.findall(r'^[A-Z][A-Z-]*?:', line)) == 0
                    and title not in line):
                ## collapse runs of whitespace
                line = re.sub(r'\s+', ' ', line)
                ## pad quoted commas, likely to keep them from being
                ## mangled in downstream CSV handling
                line = line.replace('","', '" , "')
                ## add to the paragraph array
                paragraphs.append(line)
            else:
                metacheck = re.findall(r'^([A-Z][A-Z-]*?):', line)
                if len(metacheck) > 0 and metacheck[0] in meta_list:
                    meta_dict[metacheck[0]] = line.replace(metacheck[0] + ': ', '')
        ## put everything in the metadata dictionary
        meta_dict['PUBLICATION'] = pub
        meta_dict['SEARCH_ID'] = doc_id
        meta_dict['DATE'] = date
        meta_dict['TITLE'] = title
        meta_dict['EDITION'] = ed
        if args.sep == 'csv':
            ## add the text to the dict to write
            meta_dict['TEXT'] = " ".join(paragraphs)

            # output the results to a single CSV file
            writer.writerow([meta_dict[x] for x in header])
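
            ## each row follows the header order, e.g. (hypothetical values):
            ## 3,The Example Times,2015-02-19,Headline Of The Article,...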
        else:
            ## otherwise, store each article as a separate flat file, with
            ## each piece of meta info on its own line
            out = "%s/%s_%s.txt" % (args.output, doc_id, date)
            fh = open(out, 'w')

            ## write title and date first
            fh.write('TITLE: %s\n' % meta_dict['TITLE'])
            fh.write('DATE: %s\n' % meta_dict['DATE'])
            ## then write the rest of the metadata
            for k, v in meta_dict.items():
                if k not in ['TITLE', 'DATE']:
                    fh.write('%s: %s\n' % (k, v))
            ## write the text last
            fh.write("\n\n".join(paragraphs) + "\n")
            fh.close()

            print('Wrote %s' % doc_id)
if args.sep == 'csv':
    outfile.close()