Last active
May 4, 2020 06:33
-
-
Save salgo60/ad0c0513343ed3a359cf5b2a8e220dd2 to your computer and use it in GitHub Desktop.
A test comparing the Europeana IDs recorded in Wikidata with the Wikidata items listed in the sameAs links of the corresponding Europeana entities
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import traceback | |
import requests | |
import sys | |
from SPARQLWrapper import SPARQLWrapper, JSON | |
# SPARQL endpoint that get_results() is called with.
endpoint_url = "https://query.wikidata.org/sparql"

# SPARQL finding objects with Property P7704 Europeana https://w.wiki/PrV
# Each row binds ?item (the Wikidata entity URI), ?E (the P7704 value) and
# ?edited (schema:dateModified of the item); rows are ordered by ?edited.
query = """SELECT * WHERE {
?item wdt:P7704 ?E;
schema:dateModified ?edited.
} ORDER BY ?edited limit 1000000"""
def get_results(endpoint_url, query):
    """Send *query* to the SPARQL endpoint at *endpoint_url*.

    The request asks for the JSON return format and the converted response
    from SPARQLWrapper is returned to the caller.
    """
    # The agent string ends with the major.minor version of the running
    # interpreter.
    # TODO adjust user agent; see https://w.wiki/CX6
    agent_string = "Europeana check user salgo60/%s.%s" % sys.version_info[:2]
    client = SPARQLWrapper(endpoint_url, agent=agent_string)
    client.setReturnFormat(JSON)
    client.setQuery(query)
    response = client.query()
    return response.convert()
# Prefix that checkEuropeanaobject() puts in front of an agent id to build
# the URL it probes.
urlbase = "http://data.europeana.eu/"
# Path prefix of agent identifiers (e.g. "agent/base/2961"); not referenced
# by the functions visible in this file.
uribase = "agent/base/"
def checkWDobject(Qnumber, currUri, timeout=30):
    """Report whether Wikidata still serves entity data for *Qnumber*.

    Requests ``Special:EntityData/<Qnumber>.json`` from www.wikidata.org.

    Args:
        Qnumber: Wikidata identifier as a string, e.g. ``"Q42"``.
        currUri: Value printed next to *Qnumber* when the entity is missing;
            it is not used for the lookup itself.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without it a stalled connection would block the
            whole run indefinitely.

    Returns:
        False (after printing ``Qnumber | currUri``) when the server answers
        404, True for every other status code.

    Raises:
        requests.RequestException: on network failures or timeouts.
    """
    checkURL = "https://www.wikidata.org/wiki/Special:EntityData/" + Qnumber + ".json"
    responseWikidata = requests.get(checkURL, timeout=timeout)
    if responseWikidata.status_code == 404:
        print(Qnumber, "|", currUri)
        return False
    return True
def checkSameAsWD(agent, wdobject, timeout=30):
    """Compare the Wikidata link of a Europeana entity with *wdobject*.

    Fetches ``https://api.europeana.eu/entity/<agent>?wskey=apidemo`` and
    looks through the ``sameAs`` list of the JSON answer for the first entry
    containing ``www.wikidata``. If the Q-number in that entry differs from
    *wdobject*, a ``Not same|`` line is printed.

    Args:
        agent: Entity path appended to the API URL, e.g. ``agent/base/2614``.
        wdobject: Q-number (string) expected to be linked from the entity.
        timeout: Seconds to wait for the API before giving up, so that one
            stalled request cannot block the whole run.

    Returns:
        The first Wikidata ``sameAs`` URI found (whether or not it matches),
        or False when none is found or the lookup fails.

    NOTE(review): the source this was restored from had lost its indentation;
    returning on the first Wikidata entry is the assumed nesting -- confirm.
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
    url = "https://api.europeana.eu/entity/" + agent + "?wskey=apidemo"
    try:
        # The context manager closes the session; this function is called
        # once per query row, so unclosed sessions would otherwise pile up.
        with requests.Session() as s:
            s.headers['User-Agent'] = user_agent
            itemJson = s.get(url, timeout=timeout).json()
        for z in itemJson["sameAs"]:
            if "www.wikidata" in z:
                checkQ = z.replace("http://www.wikidata.org/entity/", "")
                if checkQ != wdobject:
                    print("\t\tNot same|", checkQ, wdobject, "|", agent)
                return z
    except Exception:
        # Best effort: report and carry on with the next entity. Catching
        # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
        traceback.print_exc()
        print("Error ", url)
    return False
def checkEuropeanaobject(agentID, wdobject, timeout=30):
    """Resolve a Europeana agent id and check its Wikidata sameAs link.

    Sends a HEAD request (following redirects) to ``urlbase + agentID``.
    Unless the final URL is the Europeana linked-open-data page, the API
    prefix and the ``?wskey=apidemo`` suffix are stripped from that URL and
    the remainder is handed to checkSameAsWD() together with *wdobject*.

    Args:
        agentID: Identifier appended to ``urlbase``, e.g. ``agent/base/2961``.
        wdobject: Q-number (string) of the Wikidata item that carries this id.
        timeout: Seconds to wait for the HEAD request, so that one stalled
            request cannot block the whole run.

    Returns:
        Always True; problems are reported on stdout only.
    """
    checkURL = urlbase + agentID
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
    try:
        # The context manager closes the session; this function runs once
        # per query row, so unclosed sessions would otherwise pile up.
        with requests.Session() as s:
            s.headers['User-Agent'] = user_agent
            responseEuropeana = s.head(checkURL, allow_redirects=True, timeout=timeout)
        finalURL = str(responseEuropeana.url)
        # NOTE(review): presumably ids that no longer resolve are redirected
        # to this landing page -- confirm; such ids are skipped silently.
        if finalURL != "https://pro.europeana.eu/page/linked-open-data":
            entityPath = finalURL.replace("https://api.europeana.eu/entity/", "").replace("?wskey=apidemo", "")
            checkSameAsWD(entityPath, wdobject)
    except Exception:
        # Best effort: report and move on. Catching Exception (not a bare
        # except) keeps Ctrl-C and SystemExit working during a long run.
        print("Error\t", "|", agentID, "|", checkURL)
    return True
# Ids used for manual spot checks of checkEuropeanaobject() (which also needs
# a Q-number as second argument): "agent/base/2961" is deleted,
# "agent/base/61829" exists, "Error" is presumably a deliberately bad id.

# Run the SPARQL query, then check every (Europeana id, Wikidata item) row.
results = get_results(endpoint_url, query)
for result in results["results"]["bindings"]:
    europeana_id = result["E"]["value"]
    item_uri = result["item"]["value"]
    # Reduce the entity URI to the bare Q-number.
    q_number = str(item_uri.replace("http://www.wikidata.org/entity/", ""))
    checkEuropeanaobject(europeana_id, q_number)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It looks like we are finding a lot of merges in Wikidata; see also this SPARQL query of merges in the last 100 days: https://w.wiki/Pq6
Part of the output of the gist above:
Not same| Q5511271 Q23656556 | agent/base/2614
Not same| Q7979118 Q2441379 | agent/base/2716
Not same| Q3181213 Q2171979 | agent/base/3795
Not same| Q7629453 Q3531426 | agent/base/9396
Not same| Q5367628 Q24248095 | agent/base/9601
Not same| Q7967026 Q6166318 | agent/base/10515
Not same| Q6715102 Q60829673 | agent/base/21114
Not same| Q4777759 Q5413313 | agent/base/21594
Not same| Q7453293 Q2272649 | agent/base/11403
Not same| Q15987259 Q11470302 | agent/base/22688
Not same| Q6150369 Q2275090 | agent/base/23158
Not same| Q6967256 Q12248500 | agent/base/14048
Not same| Q2743340 Q2127179 | agent/base/14801
Not same| Q4673233 Q417496 | agent/base/14872
Not same| Q16106593 Q5976344 | agent/base/15031
Not same| Q5421710 Q1384840 | agent/base/25443
Not same| Q7514048 Q6128285 | agent/base/25449
Not same| Q6409422 Q12587858 | agent/base/15429
Not same| Q4895729 Q3635436 | agent/base/16084
Not same| Q5767770 Q2227116 | agent/base/18753
Not same| Q6513044 Q1691960 | agent/base/29434
Not same| Q2534493 Q1882271 | agent/base/31177
Not same| Q7525637 Q1709034 | agent/base/31724
Not same| Q4766720 Q18572235 | agent/base/31882
Not same| Q7927657 Q4313152 | agent/base/33682
Not same| Q7668456 Q6137435 | agent/base/35089
Not same| Q6686054 Q18935384 | agent/base/35316
Not same| Q16767189 Q8201473 | agent/base/55860
Not same| Q4738777 Q6590899 | agent/base/56587
Not same| Q7295802 Q6081387 | agent/base/57755
Not same| Q16229726 Q1677234 | agent/base/57917
Not same| Q8074729 Q18410294 | agent/base/57939
Not same| Q7493223 Q13010551 | agent/base/37879
Not same| Q3303307 Q1687582 | agent/base/38276
Not same| Q5336353 Q4263381 | agent/base/59219
Not same| Q5307175 Q3715334 | agent/base/59456
Not same| Q17051593 Q9358032 | agent/base/42738
Not same| Q8015200 Q1418442 | agent/base/38872
Not same| Q5519557 Q6758466 | agent/base/43387
Not same| Q5563714 Q3766062 | agent/base/57346
Not same| Q7417120 Q1529927 | agent/base/43686
Not same| Q8051364 Q3784470 | agent/base/39035
Not same| Q7239084 Q13016624 | agent/base/39822
Not same| Q16918483 Q1640910 | agent/base/44595
Not same| Q5336723 Q20438429 | agent/base/44999
Not same| Q7638868 Q6472274 | agent/base/41352
Not same| Q15987797 Q6850242 | agent/base/47638
Not same| Q17314198 Q16783777 | agent/base/47849
Not same| Q5548322 Q5548319 | agent/base/47950
Not same| Q16769854 Q12725837 | agent/base/48231
Not same| Q5394253 Q42349141 | agent/base/48591
Not same| Q5214220 Q202613 | agent/base/66769
Not same| Q17060973 Q947713 | agent/base/54095
Not same| Q5368275 Q19599065 | agent/base/78317
Not same| Q6192823 Q4590620 | agent/base/91910
Not same| Q7129452 Q20561096 | agent/base/92185
Not same| Q1968828 Q1165276 | agent/base/94174
Not same| Q5340245 Q4912488 | agent/base/56145