Skip to content

Instantly share code, notes, and snippets.

@bquistorff
Created November 20, 2014 02:11
Show Gist options
  • Save bquistorff/c5f315077827ab83cd3d to your computer and use it in GitHub Desktop.
Save bquistorff/c5f315077827ab83cd3d to your computer and use it in GitHub Desktop.
Normalizes a Stata gph file
#! /usr/bin/env python
'''
Normalizes a Stata .gph graph file to remove build-specific information.
Filepaths, dates, times, and random IDs are converted so that the file
will be the same no matter where it is built.
Pass in a file pattern (that is quoted so the shell doesn't expand it).
To do:
-For serset, what if string variable?
'''
import os, sys, tempfile, shutil, re, glob, getopt
# See http://code.activestate.com/recipes/66434-change-line-endings/ for general
def unixline(line):
# from Windows
if len(line)>=2 and line[-2:]=="\r\n":
return line[:-2]+'\n'
# from Mac
if len(line)>=1 and line[-1:]=="\r":
return line[:-1]+'\n'
return line
# splits line and makes sure each chunk's padding after
# zero-termination is zeroed out.
def zero_pad_list(line, numseries):
width = len(line)/numseries
newline = ''
for series in range(0, numseries):
startloc = series * width
loc_zero = line.find('\x00', startloc, startloc + width)
if(loc_zero!=-1):
newline += line[startloc:loc_zero] + '\x00'*(width - (loc_zero - startloc))
else:
newline += line[startloc:startloc+width]
return newline
def normalize_file(in_fname, newfile=False):
in_fh = open(in_fname, "rb")
if newfile:
out_fname = in_fname + ".nor"
out_fh = open(out_fname, "wb")
print "Normalizing: " + in_fname + " > " + out_fname
else:
out_fh = tempfile.NamedTemporaryFile("wb", delete=False)
out_fname = out_fh.name
print "Normalizing: " + in_fname
idnum = 1
replace_list = {}
inserset = False
for line in in_fh:
# Remove the time- and filesystem-specific info
# Maybe I should be writing blank values for the RHS ones, but works currently
if line.find("*! command_date:") == 0 or line.find(".date ") == 0 or \
line.find("*! command_time:") == 0 or line.find(".time ") == 0 or \
line.find("*! datafile:") == 0 or line.find(".dta_file") == 0 or \
line.find("*! datafile_date:") == 0 or line.find(".dta_date ") == 0 :
continue
if line.find("<BeginSerset>") == 0:
numseries = 0
if line.find("<BeginSeries>") == 0:
numseries = numseries + 1
# Don't do id replacing or line conversion in serset
# Also zero out the padding in the string fields
if inserset:
if line.find("<EndSersetData>") == 0:
start_vlist = 26 + numseries
start_flist = start_vlist + numseries*54
start_data = start_flist + numseries*49
out_fh.write(sersetline[:start_vlist])
out_fh.write(zero_pad_list(sersetline[start_vlist:start_flist], numseries))
out_fh.write(zero_pad_list(sersetline[start_flist:start_data], numseries))
out_fh.write(unixline(sersetline[start_data:]))
out_fh.write(unixline(line))
inserset = False
else:
sersetline = sersetline+line
continue
if line.find("sersetreadwrite") == 0:
inserset = True
sersetline = line
continue
# Windows vs Unix string formating of decimals can be off by a digit
mat = re.match("^(\.m(in|ax) =.+\.[0-9]{12})[0-9]+", line)
if(mat != None):
out_fh.write(mat.group(1)+'\n')
continue
# Replace the random ids with deterministic ones
# (same seed doesn't seem to make these deterministic)
mat = re.match("<BeginItem> [^ ]+ ([^\s]+)", line)
if(mat != None):
replace_list[mat.group(1)] = 'K%07x' % idnum
idnum = idnum + 1
for k, v in replace_list.iteritems():
line = line.replace(k,v)
out_fh.write(unixline(line))
in_fh.close()
out_fh.close()
if not newfile:
shutil.copyfile(out_fname, in_fname)
os.remove(out_fname)
def error(message):
sys.stderr.write(message + '\n')
sys.exit(1)
def main(argv):
try:
optlist, files = getopt.getopt(argv[1:], "hn")
except getopt.GetoptError as err:
print err
error('usage: %s [-h] [-n] files' % os.path.basename(argv[0]))
newfile = False
for o, a in optlist:
if o == "-h":
print "Usage: %s [-h] [-n] files"
print "Options:"
print " -h Print help"
print " -n Make new files with extension .nor"
print "Example:"
print " %s ../fig/gph/plot1.gph ../fig/gph/plot2.gph" % os.path.basename(argv[0])
sys.exit(0)
elif o == "-n":
newfile = True
else:
error('unhandled option [%s,%s]' % (o, a))
if len(files)==0:
error("No files found")
else:
for file in files:
normalize_file(file.strip(), newfile)
if __name__ == "__main__":
main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment