Created
September 3, 2016 16:22
-
-
Save JaimieMurdock/6ee414dd87e07d6558a936661fa83a5d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Match authority records to Wikipedia pages.
#
# Reads a tab-separated file (path = last command-line argument) whose first
# column is an authority id and whose second column is a name, looks every
# name up through the ``wikipedia`` package and appends one row per hit to
# ``entities.tsv``: authority URL, Wikipedia URL, name, position in the run.
#
# NOTE: written for Python 2 -- it relies on byte-string CSV cells
# (``name.decode``) and the ``'rU'`` file mode.
import codecs
import csv
import sys
import time

from requests.exceptions import ConnectionError
import wikipedia
from wikipedia.exceptions import PageError, DisambiguationError, RedirectError

ISIS_URL = "http://data.isiscb.org/isis/authority/{}/"

# Number of leading entries to skip. Also used as the start of the index
# written to the output, so both always agree.
# NOTE(review): resuming by position assumes ``names`` iterates in the same
# order on every run -- confirm for the interpreter in use.
RESUME_OFFSET = 51158

# Pause between consecutive lookups, in seconds.
REQUEST_DELAY = 0.75

# Pause after a connection failure, in seconds, and how many times a single
# name is retried before it is given up on.
CONNECTION_RETRY_DELAY = 300
MAX_CONNECTION_RETRIES = 3


def _display_name(raw):
    """Swap the two halves of *raw* around its first comma and trim it.

    "Last, First" becomes "First Last"; a value without a comma is returned
    stripped but otherwise unchanged.
    """
    return ' '.join(raw.split(',', 1)[::-1]).strip()


def _fetch_page(name):
    """Return the Wikipedia page for *name*, or None when there is none.

    Missing, ambiguous and redirect-only titles yield None. A connection
    failure is followed by a pause and a retry of the same name; previously
    the script paused and then moved on, silently dropping that name.
    After MAX_CONNECTION_RETRIES failed retries the name is skipped and
    reported on stderr.
    """
    retries_left = MAX_CONNECTION_RETRIES
    while True:
        try:
            return wikipedia.page(name)
        except (PageError, DisambiguationError, RedirectError):
            return None
        except ConnectionError:
            time.sleep(CONNECTION_RETRY_DELAY)
            if retries_left == 0:
                sys.stderr.write(
                    "giving up on %s after repeated connection errors\n"
                    % name)
                return None
            retries_left -= 1


names = dict()
with open(sys.argv[-1], 'rU') as infile:
    for row in csv.reader(infile, delimiter='\t'):
        if len(row) < 2:
            # Blank or truncated line: nothing to look up.
            continue
        names[row[0]] = _display_name(row[1])
print(len(names))

# NOTE(review): found_first is never incremented by the active code, so the
# first number in the summary below is always 0. The only increment lived in
# a disabled wikipedia.search()-based matching experiment that used to sit
# in the loop as a bare string literal.
found_first = 0
found = 0

with codecs.open('entities.tsv', 'a', encoding='utf-8') as outfile:
    pending = list(names.items())[RESUME_OFFSET:]
    for index, (isis_id, name) in enumerate(pending, RESUME_OFFSET):
        page = _fetch_page(name)
        if page is not None:
            fields = [ISIS_URL.format(isis_id), page.url,
                      name.decode('utf-8'), str(index)]
            outfile.write(u"\t".join(fields) + '\n')
            found += 1
        # Throttle requests regardless of the outcome.
        time.sleep(REQUEST_DELAY)

print("Found {} entries unambiguously, resolved {}.".format(found_first, found))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment