Skip to content

Instantly share code, notes, and snippets.

@stestagg
Last active May 22, 2020 06:46
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save stestagg/910859576f44f20e509822365414290d to your computer and use it in GitHub Desktop.
Save stestagg/910859576f44f20e509822365414290d to your computer and use it in GitHub Desktop.
import tubes
import numpy as np
from matplotlib import pyplot
import glob
from os import path
# Paths of all 1-gram data files matching the pattern below, with "~"
# expanded to the current user's home directory.
FILES = glob.glob(
    path.expanduser("~/src/data/ngrams/1gram/googlebooks*")
)

# Field names used by main() to index the structured array it builds.
# They are the string positions of the five values produced per input row:
# three word-match flags, then the year and the count.
(
    PYTHON_MATCH,
    PASCAL_MATCH,
    PERL_MATCH,
    YEAR_COL,
    COUNT_COL,
) = ("0", "1", "2", "3", "4")
def main():
    """Plot the relative mentions of "Python", "Perl" and "Pascal" by year.

    Reads every file in ``FILES`` as headerless TSV, using column 0 as the
    word, column 1 as the year and column 2 as the count. Rows with a year
    of 1799 or earlier are dropped. For each language the matching rows'
    counts are expressed as a percentage of that year's total count, the
    mean of the values up to 1960 is subtracted as a baseline, and the
    result is plotted against the year.

    Side effects: writes ``compare.png`` to the current directory and opens
    the interactive matplotlib window.
    """
    # Each output row carries five values, addressed later by position
    # ('0'..'4'): the three word-match flags, then year and count.
    one_grams_tube = (
        tubes.Each(FILES)
        .read_files()
        .split()
        .tsv(headers=False)
        .skip_unless(lambda row: row.get(1).to(int).gt(1799))
        .multi(lambda row: (
            row.get(0).equals(b"Python"),
            row.get(0).equals(b"Pascal"),
            row.get(0).equals(b"Perl"),
            row.get(1).to(int),
            row.get(2).to(int),
        ))
    )
    one_grams = one_grams_tube.ndarray(estimated_rows=500_000_000, fields=True)

    # Sum the counts per year. With 2009 unit-width bins over [0, 2009],
    # year_totals[y] is the total for year y, for y in 0..2008.
    # NOTE(review): a row with year 2009 or later would index past the end
    # of year_totals below -- confirm the data stops at 2008.
    year_totals, _ = np.histogram(
        one_grams[YEAR_COL],
        density=False,
        range=(0, 2009),
        bins=2009,
        weights=one_grams[COUNT_COL],
    )

    pyplot.figure(figsize=(9, 5))
    pyplot.title("Relative published mention of Language by year", y=1.04)
    pyplot.xlabel("Publication Year")
    pyplot.ylabel("Relative mentions")

    for lang, col in [
        ("Python", PYTHON_MATCH),
        ("Perl", PERL_MATCH),
        ("Pascal", PASCAL_MATCH),
    ]:
        # Element-wise comparison on the array column, deliberately not
        # `is True`; it selects the rows whose match flag is set.
        matching_rows = one_grams[one_grams[col] == True]  # noqa: E712
        years = matching_rows[YEAR_COL]

        # Percentage of each year's total count taken by this word.
        relative_counts = (matching_rows[COUNT_COL] * 100) / year_totals[years]

        # Subtract the mean level up to 1960 so each line is plotted
        # relative to its own earlier usage.
        baseline = relative_counts[years <= 1960].mean()
        pyplot.plot(years, relative_counts - baseline, label=lang)

    pyplot.legend()

    # Show 1950-2009 only; keep the autoscaled upper y limit and pin the
    # lower one just below zero.
    current_axes = pyplot.axis()
    pyplot.axis([1950, 2009, -0.00001, current_axes[3]])
    pyplot.grid(color="black", linewidth=0.3, linestyle='dotted')
    pyplot.savefig("compare.png", dpi=300, transparent=True)
    pyplot.show()
# Run the analysis only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment