Skip to content

Instantly share code, notes, and snippets.

@harej
Created June 27, 2020 18:43
Show Gist options
  • Save harej/9ce360452e4728eb1020bd89e35e1dec to your computer and use it in GitHub Desktop.
Save harej/9ce360452e4728eb1020bd89e35e1dec to your computer and use it in GitHub Desktop.
import requests
import json
import sys
from multiprocessing.dummy import Pool as ThreadPool
from wikidataintegrator import wdi_core, wdi_login
from wikidataintegrator.wdi_core import WDItemEngine
# Endpoints of the target Wikibase instance: the MediaWiki action API and the
# SPARQL query service. Both are passed to WikidataIntegrator below.
mediawiki_api_url = 'https://iagraph.wiki.opencura.com/w/api.php'
sparql_endpoint_url = 'https://iagraph.wiki.opencura.com/query/sparql'
# Login object handed to item.write() in run_dedupe.
# NOTE(review): 'USERNAME' / 'PASSWORD' look like placeholders that must be
# replaced with real bot credentials before running — confirm.
# NOTE(review): WDLogin presumably authenticates when constructed, i.e. at
# import time of this module — confirm against WikidataIntegrator.
login = wdi_login.WDLogin(
user='USERNAME',
pwd='PASSWORD',
mediawiki_api_url=mediawiki_api_url)
# Item-engine factory bound to the endpoints above; run_dedupe calls it as
# get_item(wd_item_id=..., new_item=False, global_ref_mode=...).
get_item = WDItemEngine.wikibase_item_engine_factory(mediawiki_api_url,
sparql_endpoint_url)
def remove_dupe_dicts(l):
    """Return the JSON-serializable dicts in *l* with duplicates removed.

    Two dicts count as duplicates when they serialize to the same JSON text
    with sorted keys, so key order inside a dict (at any nesting level) does
    not matter.

    The first occurrence of each distinct dict is kept and the relative order
    of the survivors matches their order in *l*. (De-duplicating through a
    plain ``set`` would return them in arbitrary, run-dependent order.)

    Args:
        l: Iterable of JSON-serializable dicts.

    Returns:
        A new list of dicts rebuilt from their JSON form; the input objects
        themselves are not returned or modified.
    """
    # dict.fromkeys keeps insertion order, giving an order-preserving "set".
    unique_strings = dict.fromkeys(
        json.dumps(d, sort_keys=True)
        for d in l
    )
    return [
        json.loads(s)
        for s in unique_strings
    ]
def run_dedupe(wb_id):
    """Remove duplicate references from every claim of one Wikibase item.

    Fetches the item's JSON from Special:EntityData, de-duplicates the
    ``references`` list of each claim with ``remove_dupe_dicts`` and, only if
    at least one list became shorter, writes the cleaned JSON back through
    WikidataIntegrator using the module-level ``get_item`` and ``login``.

    This is best-effort: an item that cannot be fetched or parsed is skipped
    silently, and a failed write is printed rather than raised, so one bad
    item does not abort a batch run.

    Args:
        wb_id: Entity ID string such as ``'Q42'``.

    Returns:
        None. Progress goes to stdout: a ``.`` per call, then either the
        value returned by ``item.write`` or the write error.
    """
    print('.', end='', flush=True)
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        r = requests.get(
            'https://iagraph.wiki.opencura.com/wiki/Special:EntityData/' + wb_id + '.json',
            timeout=60)
        blob = r.json()['entities'][wb_id]
    except Exception:
        # Network error, non-JSON body, or the entity is missing from the
        # response: skip it. KeyboardInterrupt/SystemExit are not caught.
        return

    claims = blob.get('claims')
    if not isinstance(claims, dict):
        # No claims to process (key absent or not a property -> list mapping).
        return

    edit_made = False
    for list_of_claims in claims.values():
        for claim in list_of_claims:
            # Statements without references omit the 'references' key.
            original_references = claim.get('references')
            if not original_references:
                continue
            deduped_references = remove_dupe_dicts(original_references)
            if len(deduped_references) < len(original_references):
                claim['references'] = deduped_references
                edit_made = True

    if not edit_made:
        return

    # Deleting additional fields added by MediaWiki but not used in WikidataIntegrator
    for key in ('pageid', 'ns', 'title', 'lastrevid', 'modified', 'type', 'id'):
        blob.pop(key, None)

    item = get_item(wd_item_id=wb_id, new_item=False, global_ref_mode='STRICT_OVERWRITE')
    item.wd_json_representation = blob
    try:
        new_wb_id = item.write(login)
    except Exception as e:
        print(e, flush=True)
    else:
        print(new_wb_id, flush=True)
def main():
    """Run the de-duplication for one item or for a fixed range of items.

    If the last command-line argument starts with ``Q`` it is treated as a
    single entity ID and only that item is processed. Otherwise the items
    ``Q<start>`` up to, but not including, ``Q<finish>`` are processed on a
    pool of 15 threads.
    """
    start = 1
    # NOTE(review): range() below excludes `finish`, so Q180000 itself is not
    # processed — confirm that is intended.
    finish = 180000
    # Require a real argument: with none given, sys.argv[-1] is the script
    # path, which must not be mistaken for an item ID when it starts with 'Q'.
    if len(sys.argv) > 1 and sys.argv[-1].startswith('Q'):
        run_dedupe(sys.argv[-1])
        return
    # map() blocks until every item is done; the context manager then shuts
    # the pool down, including when map() raises.
    with ThreadPool(15) as pool:
        pool.map(run_dedupe, ['Q' + str(n) for n in range(start, finish)])


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment