Skip to content

Instantly share code, notes, and snippets.

@salgo60
Last active April 21, 2020 10:18
Show Gist options
  • Save salgo60/d25e19626bf72693e0ba1baf6cb1560e to your computer and use it in GitHub Desktop.
Save salgo60/d25e19626bf72693e0ba1baf6cb1560e to your computer and use it in GitHub Desktop.
If we have a sv:Wikipedia article linking SBL, compare with Wikidata records that have P3217 but no sv:Wikipedia article
#Find duplicate SBL WD records .....
# SBL but no sv article https://w.wiki/NUT
#Version 0.2 add blacklist
__version__ = "0.2"
__author__ = "Magnus Sälgö"
print ("version: ",__version__)
import requests
from bs4 import BeautifulSoup
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
# Wikidata Query Service SPARQL endpoint.
endpoint_url = "https://query.wikidata.org/sparql"
# Select every human (P31 = Q5) with an SBL id (P3217) that has NO
# Swedish Wikipedia sitelink (the MINUS clause removes items with an
# sv-language schema:about link).
query = """SELECT ?item ?SBL
WHERE {
?item wdt:P3217 ?SBL.
?item wdt:P31 wd:Q5.
minus { ?languagelink schema:about ?item.
?languagelink schema:inLanguage "sv"}
SERVICE wikibase:label { bd:serviceParam wikibase:language "sv" }
} """
# SBL ids of Wikidata items lacking a Swedish Wikipedia article (filled below).
noWikipediaArticleList = []
def get_results(endpoint_url, query):
    """Execute *query* against the SPARQL *endpoint_url* and return the JSON result."""
    # Identify this script to the endpoint; the agent string carries the
    # running Python major.minor version (same string as "pkg/%s.%s" % (...)).
    agent_string = f"CheckSBLduplicates/{sys.version_info[0]}.{sys.version_info[1]}"
    client = SPARQLWrapper(endpoint_url, agent=agent_string)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    return client.query().convert()
# Run the SPARQL query once and collect every SBL id whose Wikidata item
# currently lacks a Swedish Wikipedia article.
results = get_results(endpoint_url, query)
# Idiomatic comprehension replaces the original append-in-a-loop
# (which also carried a stray trailing semicolon).
noWikipediaArticleList = [
    result["SBL"]["value"] for result in results["results"]["bindings"]
]
# Swedish Wikipedia Special:LinkSearch result pages listing every page that
# links to sok.riksarkivet.se/sbl/Presentation.aspx?id=... (two pages because
# the result set exceeds one page; limits/offsets are hard-coded).
urls = ["https://sv.wikipedia.org/w/index.php?title=Special:L%C3%A4nks%C3%B6kning&limit=7000&offset=0&target=http%3A%2F%2Fsok.riksarkivet.se%2Fsbl%2FPresentation.aspx%3Fid%3D",
"https://sv.wikipedia.org/w/index.php?title=Special:L%C3%A4nks%C3%B6kning&limit=5000&offset=5000&target=http%3A%2F%2Fsok.riksarkivet.se%2Fsbl%2FPresentation.aspx%3Fid%3D"]
# Base URLs printed for each hit: append the SBL id to get, respectively, a
# Wikipedia link-search page and a Wikidata haswbstatement:P3217 search.
searchURLWikipedia = "https://sv.wikipedia.org/w/index.php?title=Special%3AL%C3%A4nks%C3%B6kning&target=http%3A%2F%2Fsok.riksarkivet.se%2Fsbl%2FPresentation.aspx%3Fid%3D"
searchURLWikidata = "https://www.wikidata.org/wiki/Special:Search?fulltext=S%C3%B6k+efter+%2710922%27&fulltext=Search&ns0=1&ns120=1&search=haswbstatement%3AP3217%3D"
# Count of possible duplicates reported so far.
nr_found = 0
# SBL ids to ignore: pages linking SBL that are not person articles.
blackList = [
    "18073",  # template talk page (Malldiskussion:SBL)
    "16788",  # family article (släktartikel)
    "19122",  # family article (släktartikel)
    "5552",   # family article (släktartikel)
    "5910",   # talk page (diskussionssida)
    "7585",   # template talk page (Malldiskussion:SBL)
]
# Scan each Special:LinkSearch page and report every SBL id that is linked
# from Swedish Wikipedia but whose Wikidata record has no sv-wiki article
# (and is not blacklisted).
#
# Fixes vs. the original loop:
#   * soup.findAll('a') was re-evaluated on every iteration (accidental
#     O(n^2)); the anchor list is now built once per page.
#   * one_a_tag['href'] raised KeyError for anchors without an href, and
#     soup.findAll('a')[0] raised IndexError on an anchor-free page; both
#     are now guarded with .get().
#   * membership tests use sets instead of O(n) list scans.
target_fragment = "sok.riksarkivet.se/sbl/presentation.aspx?id="
known_missing = set(noWikipediaArticleList)
blocked = set(blackList)
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    for a_tag in soup.find_all('a'):
        href = a_tag.get('href')
        if href is None or target_fragment not in str(href).lower():
            continue
        SBLid = a_tag.text.replace("http://sok.riksarkivet.se/sbl/Presentation.aspx?id=", "")
        if SBLid in known_missing and SBLid not in blocked:
            print ("SBL: " + str(SBLid))
            print ("\t", searchURLWikidata + str(SBLid), " ")
            print ("\t", searchURLWikipedia + str(SBLid), " ")
            nr_found += 1
print(f'Nr possible duplicates found: {nr_found}')
@salgo60
Copy link
Author

salgo60 commented Apr 21, 2020

/usr/local/bin/python3.8 /Users/magnus/Library/Preferences/PyCharmCE2019.2/scratches/scratch_42.py
version: 0.2
Nr possible duplicates found: 0

Process finished with exit code 0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment