Last active
August 6, 2020 21:32
-
-
Save jvfe/fad021e45e8f93e2670e1d582b76a23c to your computer and use it in GitHub Desktop.
A few utilities to easily get data from Wikidata into a dataframe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict

import pandas as pd
import requests
def perform_query(query, timeout=60):
    """Perform a SPARQL query against the Wikidata endpoint.

    Args:
        query: A string containing a functional SPARQL query.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block forever.

    Returns:
        The decoded JSON response content, or None when the server answers
        with an HTTP error status (the error is printed in that case).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # NOTE(review): Wikimedia's User-Agent policy asks clients to send a
    # descriptive User-Agent header; consider adding one here.
    endpoint_url = "https://query.wikidata.org/sparql"

    try:
        response = requests.get(
            endpoint_url,
            params={"query": query},
            headers={"Accept": "application/sparql-results+json"},
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        # Best-effort behaviour kept from the original: report and carry on.
        print(err)
        return None

    return response.json()
def parse_query_results(query_result, field_list):
    """Parse Wikidata query results into a dataframe.

    Args:
        query_result: A JSON dict with the results from the query, laid out
            as ``{"results": {"bindings": [...]}}``.
        field_list: A list of the fields from the response you want in your
            final dataframe. E.g. if your query selects "?item" and
            "?itemLabel" you may want ["item", "itemLabel"] as your list.

    Returns:
        A pandas dataframe with a column for each component of field_list
        and one row per binding. A field missing from a binding is stored
        as a missing value. The "http://www.wikidata.org/entity/" prefix is
        stripped from every string value.
    """
    # Seed every requested column up front so an empty result set still
    # yields a dataframe with the expected columns.
    parsed_results = {field: [] for field in field_list}

    for binding in query_result["results"]["bindings"]:
        for field in field_list:
            # SPARQL JSON results leave unbound variables (e.g. from
            # OPTIONAL clauses) out of the binding altogether; record None
            # so that all columns keep the same length.
            cell = binding.get(field)
            parsed_results[field].append(None if cell is None else cell["value"])

    # The pattern is applied as a regex, so the dots are escaped to match
    # only literal dots.
    return pd.DataFrame.from_dict(parsed_results).replace(
        {r"http://www\.wikidata\.org/entity/": ""}, regex=True
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment