Skip to content

Instantly share code, notes, and snippets.

@triplingual
Created January 22, 2019 21:35
Show Gist options
  • Save triplingual/7be8873e2900a2b66b4671fc4ccd1643 to your computer and use it in GitHub Desktop.
Save triplingual/7be8873e2900a2b66b4671fc4ccd1643 to your computer and use it in GitHub Desktop.
Python script to transform wordcount files needed for @agoldst/dfrtopics (older) from JSTOR DFR ngram XML files (newer)
# columns to create and populate:
# WORDCOUNTS, WEIGHT
import unicodecsv as csv # need to install unicodecsv, tho
import re
import os
import sys
import glob
# Convert JSTOR DFR unigram count files (tab-separated ``*.txt``) into the
# ``wordcounts_*.CSV`` files, each with a WORDCOUNTS,WEIGHT header row.
#
# Usage: python <this script> [directory-with-ngram-txt-files]
#
# Output goes to a ``wordcounts`` directory that is a sibling of the input
# directory (``<input>/../wordcounts``); it is created if missing.
#
# The syntax below is valid on both Python 2.6+ and Python 3: ``print`` is
# always called with a single parenthesised argument and the directory mode
# uses the ``0o`` octal prefix.


def working_path(argv):
    """Return the directory to scan, taken from the command-line arguments.

    argv -- the full argument list (``sys.argv`` style, program name first).

    With no directory argument the current working directory is used;
    otherwise the first argument is used with trailing slashes removed.
    """
    if len(argv) < 2:
        print('Looking for wordcount files in current directory . . .')
        print('')
        return os.getcwd()
    print('Using ' + argv[1] + ' as working path for wordcount files.')
    return argv[1].rstrip('/')


def wordcounts_filename(ngram_filename):
    """Derive the output file name from an ngram file name.

    ``journal-article-`` becomes ``wordcounts_``, ``-ngram1`` is dropped and
    the ``.txt`` extension becomes ``.CSV``.
    """
    name = re.sub('journal-article-', 'wordcounts_', ngram_filename)
    name = re.sub('-ngram1', '', name)
    # Anchor at the end so only the extension is rewritten, not a '.txt'
    # that happens to appear in the middle of the identifier.
    return re.sub(r'\.txt$', '.CSV', name)


def convert_ngram_file(ngram_path):
    """Write the wordcounts CSV for one tab-separated ngram file.

    The first two tab-separated fields of every input row are copied to the
    output under a ``WORDCOUNTS,WEIGHT`` header. Rows with fewer than two
    fields (for example blank lines) are skipped instead of raising
    IndexError.
    """
    head, tail = os.path.split(ngram_path)
    output_dir = os.path.join(head, os.pardir, 'wordcounts')
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir, 0o755)
    output_path = os.path.join(output_dir, wordcounts_filename(tail))

    # Both files are opened in binary mode, as in the original script; ``csv``
    # here is the module-level ``unicodecsv`` import.
    with open(ngram_path, 'rb') as csvinput:
        with open(output_path, 'wb') as csvoutput:
            thereader = csv.reader(csvinput, delimiter='\t')
            thewriter = csv.writer(csvoutput)
            thewriter.writerow(['WORDCOUNTS', 'WEIGHT'])
            for row in thereader:
                if len(row) < 2:
                    continue
                thewriter.writerow([row[0], row[1]])


def main():
    """Convert every ``*.txt`` file found in the working directory."""
    pathname = working_path(sys.argv)
    # Get filenames from system and convert them one by one.
    for ngrampath in glob.glob(pathname + '/*.txt'):
        print(ngrampath)
        convert_ngram_file(ngrampath)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment