Script for splitting LexisNexis files. Adapted from an original script by Neal Caren.
#!/usr/bin/env python
# encoding: utf-8

"""
split_ln.py

Created by Neal Caren on 2012-05-14.
neal.caren@unc.edu

Edited by Alex Hanna on 2015-01-29.
alex.hanna@gmail.com

Takes a downloaded plain-text LexisNexis file and converts it into a CSV
file or a set of flat files.
"""
import argparse, csv, os, re, sys
from datetime import datetime
parser = argparse.ArgumentParser(description='Parse Lexis-Nexis files into different outputs.')
parser.add_argument('files', metavar='file', type=str, nargs='+', help='Lexis-Nexis files to be parsed.')
parser.add_argument('--output_dir', dest='output', action='store', help='Directory in which to store the output.')
parser.add_argument('--sep', dest='sep', const='sep', default='csv', action='store_const',
                    help='Flag to store output in separate files.')
args = parser.parse_args()
if args.output:
    if not os.path.isdir(args.output):
        print("Not a valid directory.")
        sys.exit(-1)
else:
    args.output = "."
## set permanent columns
header = ['SEARCH_ID', 'PUBLICATION', 'DATE', 'TITLE', 'EDITION']
if args.sep == 'csv':
    ## use today's date to name the output file
    today_str = datetime.today().strftime('%Y-%m-%d')
    outname = "%s/lexis-nexis_%s.csv" % (args.output, today_str)

    # set up the output file; newline='' keeps the csv module from
    # writing extra blank rows on Windows
    outfile = open(outname, 'w', newline='')
    writer = csv.writer(outfile)

## the CSV header is written once, from the first file's metadata
header_written = False
for fn in args.files:
    print('Processing %s' % fn)

    # read the file; utf-8-sig strips any leading byte-order mark, and
    # newline='' preserves the \r\n line endings the script splits on
    lnraw = open(fn, encoding='utf-8-sig', newline='').read()
    # silly hack to find the end of each document
    workfile = re.sub(r' Copyright .*?\r\n', 'ENDOFILE', lnraw)
    # split the file into a list of documents
    workfile = workfile.split('ENDOFILE')

    # remove blank rows
    workfile = [f for f in workfile if len(f.split('\r\n\r\n')) > 2]
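
    ## Each document in a LexisNexis plain-text download typically looks
    ## like the following (hypothetical example; blocks separated by
    ## blank lines), which is why fields are pulled out by position below:
    ##
    ##     3 of 120 DOCUMENTS
    ##     The Example Times
    ##     February 19, 2015 Thursday, Late Edition
    ##     Headline Of The Article
    ##     BYLINE: Jane Doe
    ##     Body paragraphs ...
    ##     LOAD-DATE: February 20, 2015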
    # figure out what metadata is being reported
    meta_list = list(set(re.findall(r'\n([A-Z][A-Z-]*?):', lnraw)))

    # keep only the commonly occurring metadata
    meta_list = [m for m in meta_list if float(lnraw.count(m)) / len(workfile) > .20]
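
    ## meta_list now holds field tags such as BYLINE, SECTION, LENGTH, or
    ## LOAD-DATE -- any tag averaging more than 0.2 occurrences per document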
    if args.sep == 'csv' and not header_written:
        ## extend the header with this file's metadata tags and write it once
        ## TK: not sure how to deal with the case where metadata changes
        ## between different input files
        header.extend(meta_list)
        header.append('TEXT')
        writer.writerow(header)
        header_written = True
    ## begin loop over each article
    for f in workfile:
        # split into paragraphs and clean up the hard returns at the end of
        # each line; also remove blank rows and the occasional copyright line
        filessplit = [row.replace('\r\n', ' ').strip() for row in f.split('\r\n\r\n')
                      if len(row) > 0 and 'All Rights Reserved' not in row]
        ## make metadata dict
        meta_dict = {k: '' for k in header}

        doc_id = filessplit[0].strip().split(' ')[0]
        pub = filessplit[1].strip()
        date_ed = filessplit[2].strip()
        title = filessplit[3].strip()
        ## format date into YYYY-MM-DD
        da = date_ed.replace(',', '').split()
        date = datetime.strptime(" ".join(da[0:3]), "%B %d %Y")
        date = date.strftime("%Y-%m-%d")
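
        ## e.g. a date line of "February 19, 2015 Thursday" would become
        ## "2015-02-19" (hypothetical example)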
        ## take the edition as the last token of the date line
        ## TK: maybe remove?
        ed = date_ed.split(' ')[-1].strip()

        ## if the edition is a time or a day of the week, skip it
        if 'GMT' in ed or 'day' in ed:
            ed = ''
        ## edit the text and other information
        paragraphs = []
        for line in filessplit[5:]:
            ## treat a line as body text if it is non-empty, not indented,
            ## not all-caps, not a metadata field, and not the title
            if (len(line) > 0 and not line.startswith('  ')
                    and line != line.upper()
                    and len(re.findall(r'^[A-Z][A-Z-]*?:', line)) == 0
                    and title not in line):
                ## collapse runs of whitespace
                line = re.sub(r'\s+', ' ', line)
                ## pad quoted commas, likely to keep them from being
                ## mangled in downstream CSV handling
                line = line.replace('","', '" , "')
                ## add to the paragraph array
                paragraphs.append(line)
            else:
                metacheck = re.findall(r'^([A-Z][A-Z-]*?):', line)
                if len(metacheck) > 0 and metacheck[0] in meta_list:
                    meta_dict[metacheck[0]] = line.replace(metacheck[0] + ': ', '')
        ## put everything in the metadata dictionary
        meta_dict['PUBLICATION'] = pub
        meta_dict['SEARCH_ID'] = doc_id
        meta_dict['DATE'] = date
        meta_dict['TITLE'] = title
        meta_dict['EDITION'] = ed
        if args.sep == 'csv':
            ## add the text to the dict to write
            meta_dict['TEXT'] = " ".join(paragraphs)

            # output the results to a single CSV file
            writer.writerow([meta_dict[x] for x in header])
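
            ## each row follows the header order, e.g. (hypothetical values):
            ## 3,The Example Times,2015-02-19,Headline Of The Article,...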
        else:
            ## otherwise, store each article as a separate flat file, with
            ## each piece of meta info on its own line
            out = "%s/%s_%s.txt" % (args.output, doc_id, date)
            fh = open(out, 'w')

            ## write title and date first
            fh.write('TITLE: %s\n' % meta_dict['TITLE'])
            fh.write('DATE: %s\n' % meta_dict['DATE'])
            ## then write the rest of the metadata
            for k, v in meta_dict.items():
                if k not in ['TITLE', 'DATE']:
                    fh.write('%s: %s\n' % (k, v))
            ## write the text last
            fh.write("\n\n".join(paragraphs) + "\n")
            fh.close()

            print('Wrote %s' % doc_id)
if args.sep == 'csv':
    outfile.close()