zapalote/extract-gbooks-terms.py

## extract-gbooks-terms.py
# extraction pattern: ngram TAB year TAB match_count TAB volume_count NEWLINE
# out: unique_ngram TAB sum(match_count) NEWLINE

import re
import os, sys, mmap
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

abv = re.compile(r'^(([A-Z]\.){1,})(_|[^\w])')  # A.B.C.
word = re.compile(r'^(([a-zA-Z]){1,})(_|[^\w])')  # ABC | Abc | abc

base = "googlebooks-eng-all-1gram-20120701-"
files = ['a','b','c','d','e','f','g','h','i','j',\
        'k','l','m','n','o','p','q','r','s',\
        't','u','v','w','x','y','z']

def process_file(file):
    global base
    vocab = {}
    fsize = Path(base+file).stat().st_size
    tot = 0
    print(f"processing {base+file}")
    with open(base+file, "r+b") as fp:
        # use a progress bar
        with tqdm(total=fsize, desc=file) as pbar:
            # map the entire file into memory, normally much faster than buffered i/o
            mm = mmap.mmap(fp.fileno(), 0)
            # iterate over the block, until next newline
            for line in iter(mm.readline, b""):
                t = ''
                # convert the bytes to a utf-8 string and split the fields
                term = line.decode("utf-8").split("\t")
                # catch patterns such as A.B.C. (old-style abbreviations)
                m_abv = abv.findall(term[0])
                if m_abv:
                    # remove punctuation
                    t = re.sub(r'[^\w]', '', m_abv.group(1))
                else:
                    m_word = word.findall(term[0])
                    if m_word:
                        t = m_word.group(1)
                # add it to dictionary if not yet included and add its match_count
                if t in vocab:
                    vocab[t] += int(term[2])
                else:
                    vocab[t] = int(term[2])
                # update the progress bar
                tot += len(line)
                pbar.update(tot - pbar.n)
            mm.close()
    fp.close()
    # output vocabulary and counts to csv file
    outf = "gbooks-en-" + file + ".csv"
    with open(outf, "a+") as fp:
        for term in vocab:
            fp.write(f"{term}\t{vocab[term]}\n")
    fp.close()

# use as many threads as possible, default: min(32, os.cpu_count()+4)
with ThreadPoolExecutor() as executor:
    result = executor.map(process_file, files)
	# extraction pattern: ngram TAB year TAB match_count TAB volume_count NEWLINE
	# out: unique_ngram TAB sum(match_count) NEWLINE

	import re
	import os, sys, mmap
	from pathlib import Path
	from tqdm import tqdm
	from concurrent.futures import ThreadPoolExecutor

	abv = re.compile(r'^(([A-Z]\.){1,})(_\|[^\w])') # A.B.C.
	word = re.compile(r'^(([a-zA-Z]){1,})(_\|[^\w])') # ABC \| Abc \| abc

	base = "googlebooks-eng-all-1gram-20120701-"
	files = ['a','b','c','d','e','f','g','h','i','j',\
	'k','l','m','n','o','p','q','r','s',\
	't','u','v','w','x','y','z']

	def process_file(file):
	global base
	vocab = {}
	fsize = Path(base+file).stat().st_size
	tot = 0
	print(f"processing {base+file}")
	with open(base+file, "r+b") as fp:
	# use a progress bar
	with tqdm(total=fsize, desc=file) as pbar:
	# map the entire file into memory, normally much faster than buffered i/o
	mm = mmap.mmap(fp.fileno(), 0)
	# iterate over the block, until next newline
	for line in iter(mm.readline, b""):
	t = ''
	# convert the bytes to a utf-8 string and split the fields
	term = line.decode("utf-8").split("\t")
	# catch patterns such as A.B.C. (old-style abbreviations)
	m_abv = abv.findall(term[0])
	if m_abv:
	# remove punctuation
	t = re.sub(r'[^\w]', '', m_abv.group(1))
	else:
	m_word = word.findall(term[0])
	if m_word:
	t = m_word.group(1)
	# add it to dictionary if not yet included and add its match_count
	if t in vocab:
	vocab[t] += int(term[2])
	else:
	vocab[t] = int(term[2])
	# update the progress bar
	tot += len(line)
	pbar.update(tot - pbar.n)
	mm.close()
	fp.close()
	# output vocabulary and counts to csv file
	outf = "gbooks-en-" + file + ".csv"
	with open(outf, "a+") as fp:
	for term in vocab:
	fp.write(f"{term}\t{vocab[term]}\n")
	fp.close()

	# use as many threads as possible, default: min(32, os.cpu_count()+4)
	with ThreadPoolExecutor() as executor:
	result = executor.map(process_file, files)