Created
March 18, 2016 06:03
-
-
Save organisciak/7d7fbdd0c78e2f56fa9f to your computer and use it in GitHub Desktop.
Calculate frequencies in many books
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from htrc_features import FeatureReader | |
import argparse | |
import pandas as pd | |
import numpy as np | |
import random | |
import string | |
def main():
    """Parse CLI args, compute term-frequency stats for the given EF files,
    and save the resulting DataFrame to a randomly named pickle."""
    parser = argparse.ArgumentParser(description='Calculate Collection '
                                     'Frequency, Page Frequency, and Book '
                                     'Frequency for a set of EF files.')
    parser.add_argument('--outpath', type=str, default='pickles',
                        help='Directory to save pickles to.')
    parser.add_argument('files', type=str, nargs='+',
                        help='list of bzip\'d EF data files')
    args = parser.parse_args()

    df = mapped_dfs(args.files)

    # Random 8-character name so concurrent runs writing to the same
    # directory don't clobber each other's output.
    alphabet = string.ascii_uppercase + string.digits
    name = ''.join(random.choice(alphabet) for _ in range(8))
    df.to_pickle("%s/%s.pickle" % (args.outpath, name))
def get_freq_stats(vol):
    """Compute per-token frequency stats for a single volume.

    Returns a DataFrame indexed by token with columns:
      CF -- collection frequency: total occurrences within this book
      PF -- page frequency: number of pages the token appears on
      BF -- book frequency contribution: always 1 here; summing across
            books later yields the number of books containing the token
    """
    tf = vol.tokenlist(pos=False)
    # The tokenlist index arrives as (page, section, token) once POS is
    # collapsed; drop the outer two levels so only the token remains.
    tf.index = tf.index.droplevel(1).droplevel(0)
    grouped = tf.groupby(level=0)
    # Dict-based renaming in Series.agg ({'CF': np.sum, ...}) was removed
    # in pandas 1.0; named aggregation is the supported equivalent.
    stats = grouped['count'].agg(CF='sum', PF='size')
    stats['BF'] = 1
    return stats
def mapped_dfs(paths):
    """Concatenate per-volume frequency stats for every EF file in *paths*.

    Volumes that fail to load or parse are skipped (best-effort batch
    processing) rather than aborting the whole run.
    """
    fr = FeatureReader(paths)
    all_df = []
    for vol in fr.volumes():
        try:
            all_df.append(get_freq_stats(vol))
        except Exception:
            # Was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit; keep the skip-on-error
            # behavior but only for ordinary exceptions.
            continue
    return pd.concat(all_df)
# Entry point: run only when executed as a script, not on import.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import argparse | |
import random | |
import string | |
def filename(N=10):
    """Return a random name of N uppercase ASCII letters and digits."""
    pool = string.ascii_uppercase + string.digits
    chars = [random.choice(pool) for _ in range(N)]
    return ''.join(chars)
# Command-line interface for the merge step: input pickle paths, an output
# directory, and optional page/book frequency floors for filtering.
arg_parser = argparse.ArgumentParser(description='Merge token frequency pickles.')
arg_parser.add_argument('paths', metavar='paths', nargs='+',
                        help='Paths of DF pickles')
arg_parser.add_argument('--outpath', type=str, default='pickles2',
                        help='Where to save the combined DF pickle')
arg_parser.add_argument('--min-pf', type=int, default=1,
                        help='Filter any terms that occur on fewer than a specified number of pages.')
arg_parser.add_argument('--min-bf', type=int, default=1,
                        help='Filter any terms that occur in fewer than a specified number of books.')
args = arg_parser.parse_args()
# Load each pickle; skip unreadable files but report them so the operator
# knows data was dropped.
dfs = []
for path in args.paths:
    try:
        dfs.append(pd.read_pickle(path))
    except Exception:
        # Narrowed from a bare `except:`; parenthesized print works under
        # both Python 2 and 3 (the original py2 print statement does not).
        print("error with %s" % path)

# Sum CF/PF/BF per token across all books. groupby(level=0) groups on the
# token index level; the original passed both by='token' and level=0,
# which is redundant.
df = pd.concat(dfs).groupby(level=0).sum()

# Apply optional frequency floors from the CLI.
if args.min_pf > 1:
    df = df[df['PF'] >= args.min_pf]
if args.min_bf > 1:
    # BUG FIX: the BF filter previously compared against args.min_pf,
    # so --min-bf silently applied the page-frequency threshold instead.
    df = df[df['BF'] >= args.min_bf]

df.to_pickle("%s/%s.pickle" % (args.outpath, filename()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment