# Gist by @zapalote, created January 6, 2022
# Data source: https://storage.googleapis.com/books/ngrams/books/datasetsv2.html
# extraction pattern: ngram TAB year TAB match_count TAB volume_count NEWLINE
# out: unique_ngram TAB sum(match_count) NEWLINE
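# e.g. an input line (illustrative values, not taken from the dataset) looks like
#   circumvallate	1978	335	91
# and, after summing match_count over all years, the corresponding output line would be
#   circumvallate	24537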
import time
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import freeze_support
import polars as pl
from humanfriendly import format_size
t0 = 0

def eta(t=None):
    # eta(0) (re)starts the timer; eta() returns the time elapsed since the last call
    global t0
    if t is not None:
        t0 = time.time()
        return
    else:
        t1 = time.time()
        t = t1 - t0
        t0 = t1
        hours, rem = divmod(t, 3600)
        minutes, seconds = divmod(rem, 60)
        return "Elapsed time {:0>2}:{:0>2}:{:06.3f}".format(int(hours), int(minutes), seconds)
def process_file(file):
    global base, stopwords
    not_word = r'(_|[^\w])'
    # print(f"processing {base+file}")
    df = pl.read_csv(base+file, sep="\t", columns=[0,2], new_columns=['word','count'])
    fsize = Path(base+file).stat().st_size
    print(f"{file} ({format_size(fsize)}), {len(df)} records")
    # filter out terms with non-alphabetical characters ...
    df = df.filter(pl.col("word").str.contains(not_word).is_not())
    # ... and terms shorter than 3 chars
    df = df.filter(pl.col("word").str.lengths() > 2)
    # ... and stop words
    df["word"] = df["word"].str.to_lowercase()
    df = df.filter(pl.col("word").is_in(stopwords).is_not())
    # sum the match counts per unique term
    df = df.groupby('word')['count'].sum().sort(by='count_sum', reverse=True)
    # keep only terms that appear more than 20,000 times across the books
    good = df.filter(pl.col("count_sum") > 20000)
    # write a tab-separated output file for this letter
    print(f"out_{file}, {len(good)} terms")
    good.to_csv(f'out_{file}.csv', sep='\t', has_header=False)
    # df.filter(pl.col("count_sum") < 20000).to_csv(f'bad_{file}.csv', sep='\t', has_header=False)
base = "googlebooks-eng-all-1gram-20120701/googlebooks-eng-all-1gram-20120701-"
files = ['a','b','c','d','e','f','g','h','i','j',\
'k','l','m','n','o','p','q','r','s',\
't','u','v','w','x','y','z']
with open('stopwords.txt') as f:
stopwords = f.read().splitlines()
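# stopwords.txt is assumed to hold one lowercase word per line, e.g.:
#   the
#   and
#   of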
def main():
    with ProcessPoolExecutor() as procs:
        # consume the iterator so that exceptions raised in the workers surface here
        list(procs.map(process_file, files))

def one():
    # single-file run, handy for testing
    process_file('a')
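# Sketch (not part of the original pipeline): one way to merge the per-letter
# outputs into a single ranked list, assuming the out_*.csv files above exist;
# uses the same 2022-era polars API as the rest of the script, so adjust
# argument names to your polars version.
def merge_outputs():
    dfs = [pl.read_csv(f'out_{f}.csv', sep='\t', has_header=False,
                       new_columns=['word', 'count_sum']) for f in files]
    merged = pl.concat(dfs).sort(by='count_sum', reverse=True)
    merged.to_csv('out_all.csv', sep='\t', has_header=False)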
# the __main__ guard (plus freeze_support on frozen/Windows builds)
# is needed for the process pool to initialize properly
if __name__ == '__main__':
    freeze_support()
    eta(0)
    main()
    # one()
    print(eta())