# jvfe/wikidata_shex_checker.py

## wikidata_shex_checker.py
from pyshex.shex_evaluator import ShExEvaluator
from pyshex.user_agent import SlurpyGraphWithAgent
from pyshex.utils.sparql_query import SPARQLQuery
import pandas as pd


def test_query_against_shex(schema, sparql):
    """Validate the items returned by a Wikidata query against a ShEx schema.

    The focus nodes are taken from the given SPARQL query and each one is
    evaluated with PyShEx against ``schema``. Both arguments are plain
    strings; watch out for line breaks at the end of the schema string, as
    that tends to break things.

    Args:
        schema (str): The ShEx schema the items are checked against.
        sparql (str): The SPARQL query that supplies the items to check.

    Returns:
        DataFrame: A Pandas DataFrame with one row per evaluation result and
            three columns: ``item`` (the focus node), ``conforms`` (whether it
            satisfies the schema) and ``reason`` (the evaluator's explanation).
    """
    wikidata_endpoint = "https://query.wikidata.org/sparql"

    # Build the graph first and the focus nodes second, matching the order in
    # which the evaluator's arguments were originally constructed.
    graph = SlurpyGraphWithAgent(wikidata_endpoint)
    focus_nodes = SPARQLQuery(wikidata_endpoint, sparql).focus_nodes()
    evaluator = ShExEvaluator(graph, schema, focus_nodes)

    rows = [
        [outcome.focus, bool(outcome.result), outcome.reason]
        for outcome in evaluator.evaluate()
    ]

    return pd.DataFrame.from_records(rows, columns=["item", "conforms", "reason"])
	from pyshex.shex_evaluator import ShExEvaluator
	from pyshex.user_agent import SlurpyGraphWithAgent
	from pyshex.utils.sparql_query import SPARQLQuery
	import pandas as pd


	def test_query_against_shex(schema, sparql):
		"""Checks the items resulting from a Wikidata query against a ShEx schema.

		Using PyShEx, we can check the validity of multiple items against a
		predefined ShEx schema. Both inputs must come as strings; be careful
		with line breaks at the end of the schema string, as that tends to
		break things.

		Args:
			schema (str): A ShEx schema to check the items against.
			sparql (str): A SPARQL query from where to obtain the items.

		Returns:
			DataFrame: A Pandas DataFrame with the results, three columns:
				the item itself (``item``), whether it conforms to the schema
				(``conforms``) and, if not, why (``reason``).
		"""
		endpoint = "https://query.wikidata.org/sparql"

		result_list = []
		for r in ShExEvaluator(
			SlurpyGraphWithAgent(endpoint),
			schema,
			SPARQLQuery(endpoint, sparql).focus_nodes(),
		).evaluate():
			# Normalise the evaluator's result to a plain bool for the frame.
			conforms = bool(r.result)
			result_list.append([r.focus, conforms, r.reason])

		result_df = pd.DataFrame.from_records(
			result_list, columns=["item", "conforms", "reason"]
		)
		return result_df