# stestagg/pytubes_compare.py

## pytubes_compare.py
import tubes
import numpy as np
from matplotlib import pyplot
import glob
from os import path

# Every path under the user's home directory that matches the Google Books
# 1-gram filename pattern; these are the inputs read by main().
FILES = glob.glob(path.expanduser("~/src/data/ngrams/1gram/googlebooks*"))

# Field names used to index the structured array built in main().  They follow
# the order of the values emitted by the pipeline's multi() step: three
# language-match flags, then year, then count.
# NOTE(review): presumably ndarray(fields=True) names fields by position -- confirm.
PYTHON_MATCH = '0'
PASCAL_MATCH = '1'
PERL_MATCH = '2'
YEAR_COL = '3'
COUNT_COL = '4'

def main():
    """Plot relative yearly mentions of Python, Perl and Pascal.

    Streams every file in ``FILES`` through a tubes pipeline, loads the
    result into a structured NumPy array, expresses each language's count as
    a percentage of the total count for the same year, subtracts that
    language's mean level for years up to 1960, and plots the adjusted
    series.  The figure is written to ``compare.png`` and then displayed.
    """
    # Rows are tab-separated; column 0 is compared against the language
    # names, column 1 is used as the year and column 2 as the count.  Only
    # rows whose year is greater than 1799 pass the skip_unless() filter.
    one_grams_tube = (tubes.Each(FILES)
        .read_files()
        .split()
        .tsv(headers=False)
        .skip_unless(lambda row: row.get(1).to(int).gt(1799))
        .multi(lambda row: (
            row.get(0).equals(b"Python"),
            row.get(0).equals(b"Pascal"),
            row.get(0).equals(b"Perl"),
            row.get(1).to(int),
            row.get(2).to(int)
        ))
    )

    one_grams = one_grams_tube.ndarray(estimated_rows=500_000_000, fields=True)

    # One unit-wide bin per year so that year_totals[y] is the summed count
    # for year y.  The bin edges are not needed afterwards.
    # NOTE(review): there are 2009 bins (indices 0..2008); a year of 2009 or
    # later in the data would index past the end of year_totals below.
    year_totals, _ = np.histogram(
        one_grams[YEAR_COL],
        density=False,
        range=(0, 2009),
        bins=2009,
        weights=one_grams[COUNT_COL])

    pyplot.figure(figsize=(9,5))
    pyplot.title("Relative published mention of Language by year", y=1.04)
    pyplot.xlabel("Publication Year")
    pyplot.ylabel("Relative mentions")

    for lang, col in [("Python", PYTHON_MATCH), ("Perl", PERL_MATCH), ("Pascal", PASCAL_MATCH)]:
        # Element-wise comparison producing a row mask for this language.
        matching_rows = one_grams[one_grams[col] == True]
        # Percentage of all counted words in the same year.
        relative_counts = (matching_rows[COUNT_COL] * 100) / year_totals[matching_rows[YEAR_COL]]
        # Use the mean level up to 1960 as the zero line for this word.
        baseline_rows = matching_rows[YEAR_COL] <= 1960
        baseline = relative_counts[baseline_rows].mean()
        adjusted_counts = relative_counts - baseline
        pyplot.plot(matching_rows[YEAR_COL], adjusted_counts, label=lang)

    pyplot.legend()
    # Keep the automatically chosen upper y limit while restricting the view
    # to 1950-2009 and starting just below zero.
    current_axes = pyplot.axis()
    pyplot.axis([1950, 2009, -0.00001, current_axes[3]])
    pyplot.grid(color="black", linewidth=0.3, linestyle='dotted')
    pyplot.savefig("compare.png", dpi=300, transparent=True)
    pyplot.show()


if __name__ == '__main__':
    # Run the analysis only when executed as a script, not when imported.
    main()