@organisciak
Created March 18, 2016 06:03
Calculate frequencies in many books
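Two scripts follow. The first walks a set of HathiTrust Extracted Features (EF) files and, for each token in each volume, records its collection frequency (CF, total occurrences in the book), page frequency (PF, number of pages it appears on), and book frequency (BF, 1 per book), saving the result as a randomly named pickle. The second merges those pickles into a single table, summing the statistics and optionally filtering rare terms.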
from htrc_features import FeatureReader
import argparse
import pandas as pd
import random
import string


def main():
    parser = argparse.ArgumentParser(description='Calculate Collection '
                                     'Frequency, Page Frequency, and Book '
                                     'Frequency for a set of EF files.')
    parser.add_argument('--outpath', type=str, default='pickles',
                        help='Directory to save pickles to.')
    parser.add_argument('files', type=str, nargs='+',
                        help="list of bzip'd EF data files")
    args = parser.parse_args()

    df = mapped_dfs(args.files)

    # Save the DataFrame to a randomly named pickle so parallel batches
    # don't overwrite each other's output
    filename = ''.join(random.choice(string.ascii_uppercase + string.digits)
                       for _ in range(8))
    df.to_pickle("%s/%s.pickle" % (args.outpath, filename))


def get_freq_stats(vol):
    # Per-page token counts, ignoring part-of-speech; the index levels
    # are (page, section, token)
    tf = vol.tokenlist(pos=False)
    # Drop the section and page levels, leaving a token-only index
    tf.index = tf.index.droplevel(1).droplevel(0)

    # Calculate collection frequency (total count within this book),
    # page frequency (number of pages the token appears on), and book
    # frequency (1 here; summing across books later yields the true BF)
    grouped = tf.groupby(level='token')
    return grouped['count'].agg(CF='sum', PF='size', BF=lambda x: 1)


def mapped_dfs(paths):
    fr = FeatureReader(paths)
    all_df = []
    for vol in fr.volumes():
        try:
            all_df.append(get_freq_stats(vol))
        except Exception:
            # Skip volumes that fail to parse rather than halt the batch
            continue
    return pd.concat(all_df)


if __name__ == '__main__':
    main()
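A minimal sketch of inspecting the output, assuming the script above was saved as calc_freqs.py (a hypothetical name; the gist does not name its files) and run over a set of bzip'd EF volumes:

# Hypothetical invocation (script and input filenames are assumptions):
#   python calc_freqs.py --outpath pickles mdp.39015012345678.json.bz2 ...
import pandas as pd

# The eight-character filename is random; this exact path is an assumption
df = pd.read_pickle('pickles/AB12CD34.pickle')
print(df.head())  # one row per token, with CF, PF, and BF columns

The second script, below, merges many such per-batch pickles into a single frequency table.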
import pandas as pd
import argparse
import random
import string


def filename(N=10):
    # Random alphanumeric name so parallel runs don't collide
    return ''.join(random.choice(string.ascii_uppercase + string.digits)
                   for _ in range(N))


parser = argparse.ArgumentParser(description='Merge token frequency pickles.')
parser.add_argument('paths', metavar='paths', nargs='+',
                    help='Paths of DF pickles')
parser.add_argument('--outpath', type=str, default='pickles2',
                    help='Where to save the combined DF pickle')
parser.add_argument('--min-pf', type=int, default=1,
                    help='Filter any terms that occur on fewer than a '
                         'specified number of pages.')
parser.add_argument('--min-bf', type=int, default=1,
                    help='Filter any terms that occur in fewer than a '
                         'specified number of books.')
args = parser.parse_args()

dfs = []
for path in args.paths:
    try:
        dfs.append(pd.read_pickle(path))
    except Exception:
        print("error with %s" % path)

# Stack the per-batch frames and sum each token's CF, PF, and BF
df = pd.concat(dfs).groupby(level='token').sum()

if args.min_pf > 1:
    df = df[df['PF'] >= args.min_pf]
if args.min_bf > 1:
    df = df[df['BF'] >= args.min_bf]

df.to_pickle("%s/%s.pickle" % (args.outpath, filename()))
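A small, self-contained demonstration of what the concat-and-sum merge does; the data here is invented for illustration:

import pandas as pd

# Frequency rows for the same token from two different books, shaped
# like the first script's output (values are made up)
a = pd.DataFrame({'CF': [3], 'PF': [2], 'BF': [1]},
                 index=pd.Index(['cat'], name='token'))
b = pd.DataFrame({'CF': [5], 'PF': [4], 'BF': [1]},
                 index=pd.Index(['cat'], name='token'))

merged = pd.concat([a, b]).groupby(level='token').sum()
print(merged)  # CF=8 occurrences, PF=6 pages, BF=2 books for 'cat'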