Skip to content

Instantly share code, notes, and snippets.

@paultopia
Created June 3, 2015 00:38
Show Gist options
  • Save paultopia/9e39cb37021f9f44658d to your computer and use it in GitHub Desktop.
Save paultopia/9e39cb37021f9f44658d to your computer and use it in GitHub Desktop.
import sys, re, argparse, string, codecs
# Command-line interface: the script takes two positional arguments.
parser = argparse.ArgumentParser()
# Positional argument 1: the file holding the in-text/footnote citations.
parser.add_argument("citearg", help="the file containing the citations from footnotes etc.")
# Positional argument 2: the file holding the reference list.
parser.add_argument("refarg", help="the file containing reference list")
# Parsed at module level, so the arguments are read as soon as this file runs;
# the values are then available as manuFiles.citearg and manuFiles.refarg.
manuFiles = parser.parse_args()
def makeCorpoi(citefile, reffile):
    """Read the citation file and the reference-list file into memory.

    Args:
        citefile: path of the file containing the citations.
        reffile: path of the file containing the reference list.

    Returns:
        A two-element list ``[citation_text, reference_text]`` holding the
        full contents of each file.

    Raises:
        IOError/OSError: if either file cannot be opened or read.
    """
    # Context managers close each handle even when a later open/read fails,
    # which the manual open()/close() pairing did not guarantee.
    with open(citefile, 'r') as citebox:
        citecorpus = citebox.read()
    with open(reffile, 'r') as refbox:
        refcorpus = refbox.read()
    return [citecorpus, refcorpus]
def conv2ASCII(bigstring):
    """Return *bigstring* with every non-ASCII character replaced by a marker.

    Each character that cannot be encoded as ASCII is replaced by the literal
    text ``1FOREIGN`` (presumably chosen so the citation regex, which accepts
    ``1`` and letters inside a name, still matches such names).

    Args:
        bigstring: the text to convert.

    Returns:
        The converted text as the interpreter's native ``str`` type, so it
        can be searched with ordinary string regex patterns.
    """
    def convHandler(error):
        # Emit the marker and resume one character past the failure point,
        # so a run of non-ASCII characters yields one marker per character.
        return ('1FOREIGN', error.start + 1)

    # Re-registering under the same name simply overwrites the previous
    # handler, so doing this on every call is harmless.
    codecs.register_error('foreign', convHandler)
    encoded = bigstring.encode('ascii', 'foreign')
    # On Python 3, encode() yields bytes, which cannot be searched with a
    # str regex pattern; decode back to text. On Python 2 the result is
    # already the native str and is returned unchanged.
    if isinstance(encoded, str):
        return encoded
    return encoded.decode('ascii')
def makeCiteList(citefile):
    """Extract the distinct author-year citations found in a block of text.

    Args:
        citefile: the text to search. Despite the name, this is the text
            itself rather than a path: it is passed directly to
            ``re.findall``.

    Returns:
        A list of the unique citations after they have been passed through
        ``cleanup``. The list is built from a set, so its order is arbitrary.
    """
    # A citation is: a leading whitespace or "(", a capitalised word (a "1"
    # is accepted wherever a letter is), optionally hyphenated, an optional
    # space or comma, a space, an optionally parenthesised four-digit year
    # with an optional letter suffix, then whitespace or closing punctuation.
    citepattern = r'[\s(][A-Z1][A-Za-z1]*-?[A-Za-z1]*[ ,]? \(?\d\d\d\d[a-z]?[\s.,)]'
    rawCitelist = re.findall(citepattern, citefile)
    # NOTE(review): cleanup() is not visible in this excerpt; it is assumed
    # to return an iterable of hashable items - confirm against its definition.
    cleanCitelist = cleanup(rawCitelist)
    # Round-trip through a set to drop duplicate citations.
    return list(set(cleanCitelist))
def checkCites(citefile, reffile):
    """Load both input files and build the list of citations to check.

    Args:
        citefile: path of the file containing the citations.
        reffile: path of the file containing the reference list.

    The portion of the function visible here reads both files, converts the
    citation text to ASCII and extracts the unique citations from it; the
    remainder was elided by the original author.
    """
    corpoi = makeCorpoi(citefile, reffile)
    # corpoi[0] is the citation text; corpoi[1] (the reference text) is not
    # used in the part of the function shown here.
    citecorpus = corpoi[0]
    # Replace non-ASCII characters before running the citation regex.
    citecorpus = conv2ASCII(citecorpus)
    citelist = makeCiteList(citecorpus)
    # and a bunch of irrelevant stuff
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment