Skip to content

Instantly share code, notes, and snippets.

@harej
Last active June 9, 2016 21:20
Show Gist options
  • Save harej/1ab12a75f8ed4e755af2 to your computer and use it in GitHub Desktop.
Save harej/1ab12a75f8ed4e755af2 to your computer and use it in GitHub Desktop.
Generates list of items and properties used on NPG-related Wikidata entries and assesses existence of labels in other languages
# Step 1: Get list of any Wikidata item with NPG ID and anything that is a subclass of chemical hazard
# Step 2: Iterate through each item for invoked items and properties
# (for claim in claims.values(); for subclaim in claim: 'Q' + str(subclaim['mainsnak']['datavalue']['value']['numeric-id'])
# and subclaim['mainsnak']['property'] where subclaim['mainsnak']['datatype'] == 'wikibase-item')
# Step 3: De-duplicate to generate exhaustive list of each item/property of interest to NIOSH
# Step 4: Check labels: en, es, zh, fr, de
# Step 5: Prepare HTML table that lists each item/property of interest, highlighting cells where values are missing
# Step 6: Take percentages of coverage in each language; save to a timestamped log
import requests
def wdqs(encoded_query):
    """Run a SPARQL query against the Wikidata Query Service.

    Args:
        encoded_query: SPARQL query text, already URL-encoded. It is placed
            verbatim into the request URL, so the caller is responsible for
            the encoding. Every result binding must carry an ``item``
            variable.

    Returns:
        List of strings, one per result binding: the ``item`` value with the
        "http://www.wikidata.org/entity/" prefix removed.

    Raises:
        requests.HTTPError: if the service answers with an error status.
        requests.Timeout: if the service does not respond in time.
    """
    base_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql?query={0}&format=json"
    # A generous timeout keeps a stalled connection from hanging the run forever.
    r = requests.get(base_url.format(encoded_query), timeout=90)
    # Fail loudly on HTTP errors rather than trying to parse an error page as JSON.
    r.raise_for_status()
    blob = r.json()

    entity_prefix = "http://www.wikidata.org/entity/"
    return [
        binding['item']['value'].replace(entity_prefix, "")
        for binding in blob['results']['bindings']
    ]
def entitydata(identifier):
    """Fetch the Special:EntityData record for a Wikidata entity.

    Placeholder: the body has not been written yet. It raises instead of
    silently returning None so that callers fail at the real cause.

    Args:
        identifier: Wikidata identifier.

    Returns:
        Dictionary based on the JSON blob from Special:EntityData.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError("entitydata() is not implemented yet")
def linked_on_page(blob):
    """List the items and properties linked on a Wikidata item.

    Placeholder: the body has not been written yet. It raises instead of
    silently returning None so that callers fail at the real cause.

    Args:
        blob: EntityData dictionary.

    Returns:
        List of items and properties linked on the Wikidata item.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError("linked_on_page() is not implemented yet")
def other_language_labels(blob, language_codes):
    """Look up an entity's labels in the requested languages.

    Placeholder: the body has not been written yet. It raises instead of
    silently returning None so that callers fail at the real cause.

    Args:
        blob: EntityData dictionary.
        language_codes: List of ISO language codes (e.g. ['en', 'de']).

    Returns:
        Dictionary of language code -> label, or language code -> None
        where no label exists.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError("other_language_labels() is not implemented yet")
def gap_analysis(manifest):
    """Compute label coverage per language.

    Placeholder: the body has not been written yet. It raises instead of
    silently returning None so that callers fail at the real cause.

    Args:
        manifest: Dictionary of dictionaries {item -> {language: label}}.

    Returns:
        Dictionary of language -> percent covered.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError("gap_analysis() is not implemented yet")
def web_page_generator(manifest):
    """Create the two web pages that present the label manifest.

    Placeholder: the body has not been written yet. It raises instead of
    silently doing nothing so that a missing report is noticed.

    Args:
        manifest: Dictionary of dictionaries {item -> {language: label}}.

    Returns:
        Nothing; the intended effect is the creation of two web pages.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError("web_page_generator() is not implemented yet")
def main():
    """Build the label manifest for chemical/exposure items and analyse gaps.

    Queries for the chemical/exposure items, looks up each one plus every
    item/property linked from it, records their labels in the languages of
    interest, and runs the gap analysis over the collected manifest.

    Raises:
        NotImplementedError: while the SPARQL query placeholder is unfilled.
    """
    language_codes = ['en', 'es', 'zh', 'fr', 'de']

    print("Querying for list of chemical/exposure items...")
    chemicals_and_exposures_query = None  # TODO: put URL-encoded query here
    if chemicals_and_exposures_query is None:
        # Stop before hitting the network with a meaningless request.
        raise NotImplementedError(
            "the chemical/exposure SPARQL query has not been filled in yet"
        )
    chemicals_and_exposures = wdqs(chemicals_and_exposures_query)

    # Entity identifier -> {language code: label or None}. Keying by
    # identifier is what de-duplicates entities reached more than once.
    master_list = {}
    for item in chemicals_and_exposures:
        print("Processing chemical/exposure item: " + item)
        blob = entitydata(item)
        # Store the whole label dictionary, not one of its keys.
        master_list[item] = other_language_labels(blob, language_codes)

        for link in linked_on_page(blob):
            if link in master_list:
                continue  # already processed; skip the extra lookup
            print("Processing linked entity: " + link)
            master_list[link] = other_language_labels(entitydata(link), language_codes)

    # master_list stays a dictionary: gap_analysis needs the labels, not
    # just the set of identifiers.
    gap_report = gap_analysis(master_list)
    # TODO: gap_report is not consumed yet; the HTML pages
    # (web_page_generator) and the coverage log still need wiring in.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment