# Gist by @zapalote, created January 6, 2022
# Data source: https://storage.googleapis.com/books/ngrams/books/datasetsv2.html
# extraction pattern: ngram TAB year TAB match_count TAB volume_count NEWLINE
# out: unique_ngram TAB sum(match_count) NEWLINE
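# e.g. an input line (illustrative values, not taken from the dataset) looks like
#   circumvallate	1978	335	91
# and, after summing match_count over all years, the corresponding output line would be
#   circumvallate	24537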
import time
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import freeze_support
import polars as pl
from humanfriendly import format_size
t0 = 0

def eta(t=None):
    # eta(0) (re)starts the timer; eta() returns the time elapsed since the last call
    global t0
    if t is not None:
        t0 = time.time()
        return
    else:
        t1 = time.time()
        t = t1 - t0
        t0 = t1
        hours, rem = divmod(t, 3600)
        minutes, seconds = divmod(rem, 60)
        return "Elapsed time {:0>2}:{:0>2}:{:06.3f}".format(int(hours), int(minutes), seconds)
def process_file(file):
    global base, stopwords
    not_word = r'(_|[^\w])'
    # print(f"processing {base+file}")
    df = pl.read_csv(base+file, sep="\t", columns=[0,2], new_columns=['word','count'])
    fsize = Path(base+file).stat().st_size
    print(f"{file} ({format_size(fsize)}), {len(df)} records")
    # filter out terms with non-alphabetical characters ...
    df = df.filter(pl.col("word").str.contains(not_word).is_not())
    # ... and terms shorter than 3 chars
    df = df.filter(pl.col("word").str.lengths() > 2)
    # ... and stop words
    df["word"] = df["word"].str.to_lowercase()
    df = df.filter(pl.col("word").is_in(stopwords).is_not())
    # sum the match counts per unique term
    df = df.groupby('word')['count'].sum().sort(by='count_sum', reverse=True)
    # keep only terms that appear more than 20,000 times across the books
    good = df.filter(pl.col("count_sum") > 20000)
    # write a tab-separated output file for this letter
    print(f"out_{file}, {len(good)} terms")
    good.to_csv(f'out_{file}.csv', sep='\t', has_header=False)
    # df.filter(pl.col("count_sum") < 20000).to_csv(f'bad_{file}.csv', sep='\t', has_header=False)
base = "googlebooks-eng-all-1gram-20120701/googlebooks-eng-all-1gram-20120701-"
files = ['a','b','c','d','e','f','g','h','i','j',\
'k','l','m','n','o','p','q','r','s',\
't','u','v','w','x','y','z']
with open('stopwords.txt') as f:
stopwords = f.read().splitlines()
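# stopwords.txt is assumed to hold one lowercase word per line, e.g.:
#   the
#   and
#   of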
def main():
    with ProcessPoolExecutor() as procs:
        # consume the iterator so that exceptions raised in the workers surface here
        list(procs.map(process_file, files))

def one():
    # single-file run, handy for testing
    process_file('a')
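# Sketch (not part of the original pipeline): one way to merge the per-letter
# outputs into a single ranked list, assuming the out_*.csv files above exist;
# uses the same 2022-era polars API as the rest of the script, so adjust
# argument names to your polars version.
def merge_outputs():
    dfs = [pl.read_csv(f'out_{f}.csv', sep='\t', has_header=False,
                       new_columns=['word', 'count_sum']) for f in files]
    merged = pl.concat(dfs).sort(by='count_sum', reverse=True)
    merged.to_csv('out_all.csv', sep='\t', has_header=False)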
# the __main__ guard (plus freeze_support on frozen/Windows builds)
# is needed for the process pool to initialize properly
if __name__ == '__main__':
    freeze_support()
    eta(0)
    main()
    # one()
    print(eta())