Skip to content

Instantly share code, notes, and snippets.

@mathigatti
Created June 9, 2019 12:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mathigatti/aa12d484ad545e909e48bfa080a11eae to your computer and use it in GitHub Desktop.
Save mathigatti/aa12d484ad545e909e48bfa080a11eae to your computer and use it in GitHub Desktop.
Normalized Google Distance in Python
import requests
from bs4 import BeautifulSoup
import math
import sys
def _first_number(stats_text):
    """Return the first whitespace-separated token that parses as a float.

    Thousands separators (",") are stripped before parsing. Returns None
    when no token is numeric.
    """
    for token in stats_text.split():
        try:
            return float(token.replace(",", ""))
        except ValueError:
            # Not a number (e.g. a word); try the next token.
            continue
    return None


def number_of_results(text):
    """Return the result count Google reports for the query *text*.

    Fetches the Google search page for *text* (with ``gl=us``), reads the
    ``<div id="resultStats">`` element and returns the first number found
    in its text as a float.

    Args:
        text: The search query. It is passed through ``params`` so that
            requests URL-encodes it; characters such as ``&`` or ``#`` no
            longer corrupt the query string.

    Returns:
        The first numeric token of the result-stats text, as a float.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        Exception: If the page has no ``resultStats`` element or that
            element contains no number.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    r = requests.get(
        "https://www.google.com/search",
        params={"q": text, "gl": "us"},
        headers=headers,
        # Without a timeout a stalled connection would hang the script forever.
        timeout=10,
    )
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")
    res = soup.find("div", {"id": "resultStats"})
    if res is None:
        # find() returns None when the element is missing; fail with the
        # function's own error instead of an AttributeError on res.text.
        raise Exception("Couldn't find a valid number of results on Google")
    print(res.text)
    number = _first_number(res.text)
    if number is None:
        raise Exception("Couldn't find a valid number of results on Google")
    print("{} results for {}".format(number, text))
    return number
# Base-2 logarithm of the total page count used as the normalisation term in
# normalized_google_distance. The count is hard-coded rather than fetched at
# import time; the live lookup it replaces was:
#   number_of_results("the")
N = math.log(25270000000.0, 2)
def normalized_google_distance(w1, w2):
    """Return the Normalized Google Distance between search terms w1 and w2.

    With f(x) = number_of_results(x) and all logarithms in base 2, the
    value is::

        (max(log f(w1), log f(w2)) - log f(w1 + " " + w2))
            / (N - min(log f(w1), log f(w2)))

    where ``N`` is the module-level log of the total page count.

    Args:
        w1: First search term.
        w2: Second search term.

    Returns:
        The distance as a float, or ``float("inf")`` when any of the three
        hit counts is zero (the logarithm is undefined there; previously
        this surfaced as a ``ValueError: math domain error``).

    Raises:
        Whatever ``number_of_results`` raises for a failed lookup.
    """
    hits_w1 = number_of_results(w1)
    hits_w2 = number_of_results(w2)
    hits_both = number_of_results(w1 + " " + w2)
    if min(hits_w1, hits_w2, hits_both) <= 0:
        # log(0) is undefined: terms that never (co-)occur are maximally distant.
        return float("inf")
    f_w1 = math.log(hits_w1, 2)
    f_w2 = math.log(hits_w2, 2)
    f_w1_w2 = math.log(hits_both, 2)
    return (max(f_w1, f_w2) - f_w1_w2) / (N - min(f_w1, f_w2))
def main(argv):
    """Command-line entry point: print the distance between two words.

    Args:
        argv: Argument list in ``sys.argv`` layout; ``argv[1]`` and
            ``argv[2]`` are the two search terms.

    Raises:
        SystemExit: With a usage message when fewer than two terms are
            given (previously this was a bare ``IndexError``).
    """
    if len(argv) < 3:
        sys.exit("Usage: python normalized_google_distance.py <word1> <word2>")
    w1 = argv[1]
    w2 = argv[2]
    score = normalized_google_distance(w1, w2)
    print("Score is", round(score, 2))
    print("W1='" + w1 + "' W2='" + w2 + "'")
# Usage examples (run from a shell; the two arguments are the words to compare):
#   python normalized_google_distance.py shakespeare macbeth
#   python normalized_google_distance.py "shakespeare " "macbeth"
# NOTE(review): this call is unguarded, so it runs on import as well as on
# direct execution.
main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment