# Created January 6, 2022 14:09
# Data source: Google Books 1-gram files (googlebooks-eng-all-1gram-20120701-*)
# input line format: ngram TAB year TAB match_count TAB volume_count NEWLINE
# output line format: unique_ngram TAB sum(match_count) NEWLINE
import os, sys
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import freeze_support
import polars as pl
from humanfriendly import format_size
import time
t0 = None  # clock reading taken at the previous eta() call; None until first use


def eta(t=None):
    """Return the time elapsed since the previous call, formatted for printing.

    Every call moves the reference point to "now", so consecutive calls
    report the duration of the work done between them.

    Args:
        t: Pass any value other than None to restart the timer. The call
            then reports zero elapsed time.

    Returns:
        A string of the form "Ellapsed time HH:MM:SS.mmm". The spelling of
        the prefix is kept as-is because it is part of the printed output.
    """
    global t0
    # A monotonic clock cannot jump when the system time is adjusted, so the
    # measured interval can never come out negative.
    now = time.monotonic()
    if t is not None or t0 is None:
        # Explicit restart, or the very first call: there is no earlier mark
        # to measure from, so report zero rather than the time since epoch.
        t0 = now
    elapsed = now - t0
    t0 = now
    hours, rem = divmod(elapsed, 3600)
    minutes, seconds = divmod(rem, 60)
    return "Ellapsed time {:0>2}:{:0>2}:{:06.3f}".format(
        int(hours), int(minutes), seconds
    )
def process_file(file):
    """Aggregate one 1-gram file into per-word totals and save the frequent words.

    The input path is the module-level ``base`` prefix followed by ``file``.
    Columns 0 and 2 of the tab-separated input are loaded as ``word`` and
    ``count``. Words are dropped when they contain an underscore or any
    non-word character, when they are shorter than 3 characters, or (after
    lowercasing) when they appear in the module-level ``stopwords``. Counts
    are then summed per word, and the words whose total exceeds 20,000 are
    written, highest total first, to ``out_<file>.csv`` (tab-separated, no
    header row).

    Args:
        file: Suffix appended to ``base`` to form the input path; also used
            in the output file name.
    """
    source = base + file
    non_word_char = r'(_|[^\w])'
    table = pl.read_csv(source, sep="\t", columns=[0,2], new_columns=['word','count'])
    size_on_disk = Path(source).stat().st_size
    print(f"{file} ({format_size(size_on_disk)}), {len(table)} records")

    word = pl.col("word")
    table = (
        table
        # drop terms containing an underscore or any non-word character ...
        .filter(word.str.contains(non_word_char).is_not())
        # ... and terms shorter than 3 characters
        .filter(word.str.lengths() > 2)
    )

    # Lowercase first so the stop-word comparison is case-insensitive.
    table["word"] = table["word"].str.to_lowercase()
    table = table.filter(word.is_in(stopwords).is_not())

    # One row per distinct word, carrying the summed count, largest first.
    totals = table.groupby('word')['count'].sum()
    ranked = totals.sort(by='count_sum', reverse=True)

    # Keep only the terms that appear more than 20,000 times in the books.
    frequent = ranked.filter(pl.col("count_sum") > 20000)

    print(f"out_{file}, {len(frequent)} terms")
    frequent.to_csv(f'out_{file}.csv', sep='\t', has_header=False)
    # Debug aid: to inspect the rejected terms, also dump
    # ranked.filter(pl.col("count_sum") < 20000) to f'bad_{file}.csv'.
# Module-level configuration read by process_file().
#
# `base` is the common path prefix of the input files; process_file() builds
# each input path as base + file.
#
# NOTE(review): the `files` list literal below is cut off after 'j' — its
# continuation line(s) and closing bracket are missing, so the statement does
# not parse. Restore the remaining entries before running.
#
# NOTE(review): the right-hand side of `stopwords =` is missing and the line
# is not indented under the `with`. process_file() passes `stopwords` to
# `is_in` after lowercasing the words, so it presumably is a collection of
# lowercase words read from `f` — confirm against the original.
base = "googlebooks-eng-all-1gram-20120701/googlebooks-eng-all-1gram-20120701-"
files = ['a','b','c','d','e','f','g','h','i','j',\
with open('stopwords.txt') as f:
stopwords =
# Entry point that opens a ProcessPoolExecutor and hands work on `files` to it.
# NOTE(review): this function is damaged in this copy — the body lines are
# not indented and the right-hand side of `p_res =` is incomplete. Presumably
# it read `procs.map(process_file, files)` — confirm against the original
# before repairing.
def main():
with ProcessPoolExecutor() as procs:
p_res =, files)
# NOTE(review): `one` has no body in this copy, so the definition does not
# parse; its only visible use is the commented-out call under the guard below.
def one():
# this is needed for the process pool to initialize properly
# NOTE(review): the statement the comment above describes is missing here.
# Presumably it is a call to freeze_support(), which is imported at the top of
# the file but never used in the visible code — confirm.
if __name__ == '__main__':
# one()
# NOTE(review): no active statement follows the guard; presumably main() was
# called here — confirm against the original.
