Skip to content

Instantly share code, notes, and snippets.

@JaimieMurdock
Created September 3, 2016 16:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JaimieMurdock/6ee414dd87e07d6558a936661fa83a5d to your computer and use it in GitHub Desktop.
Save JaimieMurdock/6ee414dd87e07d6558a936661fa83a5d to your computer and use it in GitHub Desktop.
import csv
import wikipedia
from wikipedia.exceptions import PageError, DisambiguationError, RedirectError
from requests.exceptions import ConnectionError
import sys
import time
import codecs

# URL template for an authority record; "{}" receives the identifier read
# from the first column of the input file.
ISIS_URL = "http://data.isiscb.org/isis/authority/{}/"

# File that matched entities are appended to (one tab-separated row each).
OUTPUT_PATH = 'entities.tsv'

# Number of leading (id, name) pairs to skip before looking anything up.
# NOTE(review): presumably the position at which an earlier run stopped --
# confirm (or reset to 0) before running against a different input file.
START_INDEX = 51158

# Seconds to wait after every lookup, successful or not.
REQUEST_PAUSE = 0.75

# Seconds to wait after a connection failure before trying the name again.
RETRY_DELAY = 300

# How many times a name is retried after a connection failure before it is
# skipped.
MAX_RETRIES = 3


def display_name(raw):
    """Turn a "Last, First" string into "First Last".

    The string is split on the first comma only, the two halves are swapped
    and joined with a single space, and surrounding whitespace is stripped.
    A string without a comma is returned stripped but otherwise unchanged.
    """
    return ' '.join(raw.split(',', 1)[::-1]).strip()


def load_names(path):
    """Read the tab-separated file at *path* into an ``{id: name}`` dict.

    Column 0 is used as the key and column 1, passed through
    ``display_name``, as the value.  Rows with fewer than two columns
    (blank lines included) are skipped instead of raising ``IndexError``.
    A later row with the same id overwrites an earlier one.
    """
    if sys.version_info[0] < 3:
        # Python 2: the csv module wants a byte stream; 'U' enables
        # universal newlines.
        infile = open(path, 'rU')
    else:
        # Python 3: the 'U' flag no longer exists; newline='' lets the csv
        # module do its own line-ending handling.
        infile = open(path, newline='', encoding='utf-8')

    names = dict()
    with infile:
        for row in csv.reader(infile, delimiter='\t'):
            if len(row) < 2:
                continue
            names[row[0]] = display_name(row[1])
    return names


def to_text(value):
    """Return *value* as text, decoding byte strings as UTF-8."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def format_row(isis_id, url, name, index):
    """Build one output line: authority URL, page URL, name and index.

    The four fields are joined with tabs and terminated by a newline.
    """
    fields = [ISIS_URL.format(isis_id), url, to_text(name), str(index)]
    return u"\t".join(fields) + '\n'


def lookup_page(name, fetch=None, retries=MAX_RETRIES, delay=RETRY_DELAY):
    """Fetch the page for *name*, retrying after connection failures.

    Args:
        name: title handed to *fetch*.
        fetch: callable taking the title and returning a page object;
            defaults to ``wikipedia.page``.
        retries: number of additional attempts after a ``ConnectionError``.
        delay: seconds to sleep after each ``ConnectionError``.

    Returns:
        Whatever *fetch* returns, or ``None`` when the lookup raised
        ``PageError``, ``DisambiguationError`` or ``RedirectError``, or when
        every attempt ended in a ``ConnectionError``.
    """
    if fetch is None:
        fetch = wikipedia.page

    for _ in range(retries + 1):
        try:
            return fetch(name)
        except ConnectionError:
            # Back off, then try the same name again rather than silently
            # moving on to the next one.
            time.sleep(delay)
        except (PageError, DisambiguationError, RedirectError):
            return None

    sys.stderr.write(
        "Giving up on {!r} after repeated connection errors\n".format(name))
    return None


def main():
    """Look up every name from the input file and append the matches.

    The input path is the last command-line argument.  The number of names
    read is printed first; each name from ``START_INDEX`` onwards is then
    looked up, matches are appended to ``OUTPUT_PATH``, and a summary line
    is printed at the end.
    """
    names = load_names(sys.argv[-1])
    print(len(names))

    # Never incremented: the search-based matching pass that used to count
    # exact first hits is disabled, so the summary always reports 0 here.
    found_first = 0
    found = 0

    # NOTE(review): the slice assumes the dict iterates in the same order as
    # in the run that produced START_INDEX -- confirm for the interpreter
    # in use.
    pending = list(names.items())[START_INDEX:]

    with codecs.open(OUTPUT_PATH, 'a', encoding='utf-8') as outfile:
        for offset, (isis_id, name) in enumerate(pending):
            page = lookup_page(name)
            if page is not None:
                outfile.write(format_row(isis_id, page.url, name,
                                         offset + START_INDEX))
                found += 1
            time.sleep(REQUEST_PAUSE)

    print("Found {} entries unambiguously, resolved {}.".format(found_first,
                                                                found))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment