Skip to content

Instantly share code, notes, and snippets.

@ZhangChengX
Created March 14, 2022 21:22
Show Gist options
  • Save ZhangChengX/da0e3fa7ae56e8e2bdd5ffa28f95a42b to your computer and use it in GitHub Desktop.
Save ZhangChengX/da0e3fa7ae56e8e2bdd5ffa28f95a42b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Reference
# https://github.com/zslwyuan/google-ngrams/blob/master/getngrams.py
from nltk.corpus import words
from ast import literal_eval
from tqdm import tqdm
import requests
import time
import re
# Corpus name -> numeric id. get_ngrams looks a name up here and sends the
# id as the `corpus` query parameter.
corpora = {
    # English (US / GB)
    'eng_us_2012': 17, 'eng_us_2009': 5, 'eng_us_2019': 28,
    'eng_gb_2012': 18, 'eng_gb_2009': 6, 'eng_gb_2019': 26,
    # Chinese (simplified)
    'chi_sim_2019': 34, 'chi_sim_2012': 23, 'chi_sim_2009': 11,
    # English (general, fiction, 1M)
    'eng_2012': 15, 'eng_2009': 0,
    'eng_fiction_2012': 16, 'eng_fiction_2009': 4, 'eng_1m_2009': 1,
    # French, German, Hebrew
    'fre_2019': 30, 'fre_2012': 19, 'fre_2009': 7,
    'ger_2019': 31, 'ger_2012': 20, 'ger_2009': 8,
    'heb_2012': 24,
    'heb_2009': 9,
    # Spanish, Russian, Italian
    'spa_2019': 32, 'spa_2012': 21, 'spa_2009': 10,
    'rus_2019': 36, 'rus_2012': 25, 'rus_2009': 12,
    'ita_2019': 33, 'ita_2012': 22,
}
# Finds the `ngrams.data = [...];` assignment in the returned page and captures
# only the array literal. Capturing the payload (instead of stripping the
# prefix and every ';' afterwards) keeps semicolons inside ngram strings intact.
_NGRAM_DATA_RE = re.compile(r'ngrams\.data = (.*\]);')


def get_ngrams(query, corpus, startYear, endYear, smoothing=3, caseInsensitive=False):
    """Fetch time series for *query* from the Google Books Ngram graph page.

    The query is sent to ``http://books.google.com/ngrams/graph`` and the
    ``ngrams.data = [...]`` array embedded in the response is parsed.

    Args:
        query: Search expression. Every ``?`` is sent as ``*`` and every
            ``@`` is sent as ``=>``.
        corpus: Key of the module-level ``corpora`` mapping.
        startYear: Value sent as ``year_start``.
        endYear: Value sent as ``year_end``.
        smoothing: Value sent as ``smoothing``.
        caseInsensitive: Sent as ``case_insensitive``; the parameter is
            omitted from the request entirely when this is ``False``.

    Returns:
        A dict mapping each returned entry's ``'ngram'`` to its
        ``'timeseries'``.

    Raises:
        KeyError: If *corpus* is not a key of ``corpora``.

    Note:
        The function does not return until a response contains exactly one
        data assignment; after any other response it prints a diagnostic,
        sleeps 300 seconds and retries, without an upper bound on attempts.
    """
    params = dict(content=query.replace('?', '*').replace('@', '=>'),
                  year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing)
    # Only include the flag when it is not literally False.
    if caseInsensitive is not False:
        params['case_insensitive'] = caseInsensitive

    while True:
        req = requests.get('http://books.google.com/ngrams/graph', params=params)
        payloads = _NGRAM_DATA_RE.findall(req.text)
        if len(payloads) == 1:
            break
        print('Try again: ', query)
        if 'Please try again later.' == req.text:
            print('Try again error')
        else:
            print('Unknown error ', payloads)
        # Back off before the next attempt.
        time.sleep(300)

    # NOTE(review): literal_eval parses Python literal syntax; this assumes the
    # embedded array contains no JSON-only tokens (true/false/null) — confirm.
    entries = literal_eval(payloads[0])
    return {entry['ngram']: entry['timeseries'] for entry in entries}
if __name__ == '__main__':
    # Score every word of the NLTK word list by its mean ngram value over
    # 2000-2019 in the eng_us_2019 corpus, then dump the scores to a file.
    scores = {}
    for term in tqdm(words.words()):
        print('Looking up: ', term)
        lookup = get_ngrams(query=term, corpus='eng_us_2019',
                            startYear=2000, endYear=2019,
                            caseInsensitive=True)

        if not lookup:
            # Nothing came back for this word: record the sentinel 1000 and
            # move straight on to the next word.
            scores[term] = 1000
            print(lookup)
            continue

        try:
            series = lookup[term]
        except KeyError:
            # Results exist, but none is keyed by the exact word: sentinel 999.
            scores[term] = 999
            print(lookup)
        else:
            mean_value = sum(series) / len(series)
            scores[term] = round(mean_value * 100, 6)

        # Pause between consecutive lookups.
        time.sleep(30)

    # One "word, score" line per entry.
    with open('google_ngrims.csv', 'w') as out:
        out.writelines("%s, %s\n" % pair for pair in scores.items())
    print('Done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment