Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active May 22, 2022 09:57
Show Gist options
  • Save xflr6/2c6ddf6cda55313bc8d4fbdc65a88ca4 to your computer and use it in GitHub Desktop.
Save xflr6/2c6ddf6cda55313bc8d4fbdc65a88ca4 to your computer and use it in GitHub Desktop.
Read pandas.DataFrame from SPARQL query
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "1d935d67-aab3-49d1-b0e3-d82acd9e3329",
"metadata": {},
"source": [
"# pandas `read_sparql_query()`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5ebd1ea3-c44f-4b56-9ab9-58f58620f643",
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"\n",
"import datetime\n",
"import decimal\n",
"import distutils.util\n",
"import functools\n",
"import io\n",
"import logging\n",
"import types\n",
"import urllib.parse\n",
"import urllib.request\n",
"import warnings\n",
"import xml.etree.ElementTree as etree\n",
"\n",
"import pandas as pd\n",
"import rdflib\n",
"import SPARQLWrapper as sparqlwrapper"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "01c3b56a-7318-4999-8a5c-a2464921bbbb",
"metadata": {},
"outputs": [],
"source": [
"ENDPOINT = 'https://query.wikidata.org/sparql'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "32fcfb65-45d5-4467-b88c-e59ea43890a1",
"metadata": {},
"outputs": [],
"source": [
"QUERY = '''\n",
"SELECT\n",
" ?glottocode\n",
" (strafter(str(?languoid), str(wd:)) AS ?qid)\n",
" (?languoidLabel AS ?name)\n",
" (strafter(str(?siteLink), \"https://en.wikipedia.org/wiki/\") AS ?title)\n",
"WHERE {\n",
" ?languoid wdt:P1394 ?glottocode.\n",
" FILTER (REGEX(?glottocode, \"^[a-z0-9]{4}[0-9]{4}$\")).\n",
" OPTIONAL {\n",
" ?siteLink schema:about ?languoid;\n",
" schema:inLanguage \"en\";\n",
" schema:isPartOf <https://en.wikipedia.org/>.\n",
" }\n",
" SERVICE wikibase:label {\n",
" bd:serviceParam wikibase:language \"en\".\n",
" ?languoid rdfs:label ?languoidLabel.\n",
" }\n",
"}\n",
"ORDER BY\n",
" ?glottocode\n",
" xsd:integer(strafter(str(?languoid), str(wd:Q)))\n",
"LIMIT 15000\n",
"'''.strip()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4ad0ee4a-44d0-4e24-a2c0-5be14d846795",
"metadata": {},
"outputs": [],
"source": [
"TEST_QUERY = '''\n",
"SELECT ?index ?string ?integer ?boolean ?decimal ?double ?float ?datetime ?date ?time\n",
"WHERE {\n",
" VALUES ?index { 0 }\n",
" VALUES ?string { \"spam\" \"eggs\"^^xsd:string }\n",
" VALUES ?integer { 7 \"42\"^^xsd:integer }\n",
" VALUES ?boolean { true false }\n",
" VALUES ?decimal { 1.3 }\n",
" VALUES ?double { 1.0e6 }\n",
" VALUES ?float { \"6.275\"^^xsd:float }\n",
" VALUES ?datetime { \"2005-04-04T04:04:04\"^^xsd:dateTime }\n",
" VALUES ?date { \"2001-01-01\"^^xsd:date }\n",
" VALUES ?time { \"18:30\"^^xsd:time }\n",
"}\n",
"'''.strip()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "beb198ed-6fc0-4a53-ba57-49f4cb9c060d",
"metadata": {},
"outputs": [],
"source": [
"LOGLEVEL = logging.INFO"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3bea886d-f8e3-49e0-9b9a-8113f33f6035",
"metadata": {},
"outputs": [],
"source": [
"logging.basicConfig(format='[%(levelname)s@%(name)s] %(message)s', level=LOGLEVEL, force=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "86aede62-2837-4013-9801-3778a6c1a21b",
"metadata": {},
"outputs": [],
"source": [
"class QueryPandas:\n",
"\n",
" def __init__(self, endpoint: str, **kwargs) -> None:\n",
" super().__init__(**kwargs)\n",
" self.endpoint=endpoint\n",
"\n",
" def __repr__(self) -> str:\n",
" return f'{self.__class__.__name__}({self.endpoint!r})'\n",
" \n",
" def query_pandas(self, sparql: str, **kwargs) -> pd.DataFrame:\n",
" result = self.query(sparql)\n",
"\n",
" df = self.read_sparql_query_result(result, **kwargs)\n",
"\n",
" with io.StringIO() as f:\n",
" df.info(buf=f, memory_usage='deep')\n",
" logging.debug(f.getvalue())\n",
"\n",
" return df\n",
"\n",
" def query(self, sparql: str):\n",
" raise NotImplementedError \n",
" \n",
" def read_sparql_query_result(self, result, **kwargs) -> pd.DataFrame:\n",
" raise NotImplementedError"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "44f9aadc-8d2c-4a59-a5b7-f58cc14db1d6",
"metadata": {},
"outputs": [],
"source": [
"def pd_dataframe_from_records(records, **kwargs) -> pd.DataFrame:\n",
" kwargs.setdefault('coerce_float', True)\n",
" logging.info('pandas.DataFrame.from_records(%r, **%r)', records, kwargs)\n",
" return pd.DataFrame.from_records(records, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a96dcb4d-ae5a-4b84-b0dd-5ea6bfc859c1",
"metadata": {},
"outputs": [],
"source": [
"def pd_read_csv(data, **kwargs) -> pd.DataFrame:\n",
" kwargs.setdefault('na_values', '')\n",
" kwargs.setdefault('keep_default_na', False)\n",
"\n",
" if kwargs.get('encoding') is None:\n",
" content_type = data.info()['content-type']\n",
" kwargs['encoding'] = get_encoding(content_type)\n",
" logging.debug('encoding: %r', kwargs['encoding'])\n",
"\n",
" logging.info('pandas.read_csv(%r, **%r)', data, kwargs)\n",
" return pd.read_csv(data, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3366b708-968f-4e85-8c25-6cb0819e0ab2",
"metadata": {},
"outputs": [],
"source": [
"def get_encoding(content_type: str) -> str: \n",
" _, sep, encoding = content_type.partition(';charset=')\n",
" assert sep and encoding\n",
" return encoding"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "bea6da0c-6b5e-4998-b97c-002af7e41e89",
"metadata": {},
"outputs": [],
"source": [
"def pd_json_normalize(jsondata, **kwargs) -> pd.DataFrame:\n",
" kwargs.setdefault('record_path', ['results', 'bindings'])\n",
" logging.info('pandas.json_normalize(jsondata, **%r)', kwargs)\n",
" return pd.json_normalize(jsondata, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "9f0ca230-7848-4ec1-92b4-70389cfcca02",
"metadata": {},
"outputs": [],
"source": [
"def pipe_info(df) -> pd.DataFrame:\n",
" df.info(memory_usage='deep')\n",
" return df"
]
},
{
"cell_type": "markdown",
"id": "be2d9827-0273-462c-b712-7d9d276694ee",
"metadata": {},
"source": [
"## urllib"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "03b875f6-397f-41a6-9479-d9889013b66e",
"metadata": {},
"outputs": [],
"source": [
"class QueryUrllib(QueryPandas):\n",
"\n",
" method = 'GET'\n",
" \n",
" headers = None\n",
" \n",
" def query(self, sparql: str, *, method=None, headers=None):\n",
" logging.info('endpoint: %r', self.endpoint)\n",
" request_url = (urllib.parse.urlparse(self.endpoint)\n",
" ._replace(query=urllib.parse.urlencode({'query': sparql})))\n",
"\n",
" request = urllib.request.Request(request_url.geturl(),\n",
" method=method if method is not None else self.method,\n",
" headers=headers if headers is not None else self.headers)\n",
" logging.info('request: %r %r', request.method, request)\n",
" logging.debug('url: %r', request.full_url)\n",
"\n",
" response = urllib.request.urlopen(request)\n",
" logging.info('response: %r %r', response.code, response)\n",
" logging.info('content-type: %r', response.headers['content-type'])\n",
" logging.debug('headers: %r', dict(response.headers))\n",
"\n",
" return types.SimpleNamespace(response=response, headers=response.headers, info=response.info)"
]
},
{
"cell_type": "markdown",
"id": "4c85659a-a85d-4cd1-8d41-74346c44106e",
"metadata": {
"tags": []
},
"source": [
"### ``CsvUrllibStrategy``"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e10dfe89-4693-4409-8993-7e73f95c2718",
"metadata": {},
"outputs": [],
"source": [
"def read_sparql_query_csv(result, **kwargs) -> pd.DataFrame:\n",
" return pd_read_csv(result.response, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "aebf75f8-9a76-400a-8b03-af6f232816a5",
"metadata": {},
"outputs": [],
"source": [
"class CsvUrllibStrategy(QueryUrllib):\n",
"\n",
" headers = {'Accept': 'text/csv'}\n",
"\n",
" read_sparql_query_result = staticmethod(read_sparql_query_csv)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "a6b910e8-b162-4922-81ca-30dcf2711208",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] request: 'GET' <urllib.request.Request object at 0x000001E1EEB98F40>\n",
"[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000001E1EEB9A590>\n",
"[INFO@root] content-type: 'text/csv;charset=utf-8'\n",
"[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000001E1EEB9A590>, **{'index_col': 'index', 'na_values': '', 'keep_default_na': False, 'encoding': 'utf-8'})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 8 entries, 0 to 0\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 string 8 non-null object \n",
" 1 integer 8 non-null int64 \n",
" 2 boolean 8 non-null bool \n",
" 3 decimal 8 non-null float64 \n",
" 4 double 8 non-null float64 \n",
" 5 float 8 non-null float64 \n",
" 6 datetime 8 non-null datetime64[ns]\n",
" 7 date 8 non-null object \n",
" 8 time 8 non-null object \n",
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n",
"memory usage: 1.9 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>string</th>\n",
" <th>integer</th>\n",
" <th>boolean</th>\n",
" <th>decimal</th>\n",
" <th>double</th>\n",
" <th>float</th>\n",
" <th>datetime</th>\n",
" <th>date</th>\n",
" <th>time</th>\n",
" </tr>\n",
" <tr>\n",
" <th>index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" string integer boolean decimal double float \\\n",
"index \n",
"0 'spam' 7 True 1.3 1000000.0 6.275 \n",
"0 'spam' 7 False 1.3 1000000.0 6.275 \n",
"0 'spam' 42 True 1.3 1000000.0 6.275 \n",
"0 'spam' 42 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n",
"\n",
" datetime date time \n",
"index \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(CsvUrllibStrategy(ENDPOINT).query_pandas(TEST_QUERY, index_col='index')\n",
" .astype({'datetime': 'datetime64'}).pipe(pipe_info).applymap(repr))"
]
},
{
"cell_type": "markdown",
"id": "911ce4dd-3ea8-4cb7-b552-0c3a21b2e2eb",
"metadata": {},
"source": [
"### ``XmlUrllibStrategy``"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "b915473b-14f1-4bd6-841c-56b1bce34192",
"metadata": {},
"outputs": [],
"source": [
"SPARQL_RESULTS = 'http://www.w3.org/2005/sparql-results#'"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "4e57a676-dd62-4bc3-94df-0c6c13db6f62",
"metadata": {},
"outputs": [],
"source": [
"def extract_ns(tag) -> str:\n",
" ns = tag.partition('{')[2].partition('}')[0]\n",
" assert tag.startswith(f'{{{ns}}}')\n",
" return ns"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "87447a7e-d491-4008-ae06-00433b93dedb",
"metadata": {},
"outputs": [],
"source": [
"NS = f'{{{SPARQL_RESULTS}}}'"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "ff74fd77-d64e-4434-9eaf-488b3f75069d",
"metadata": {},
"outputs": [],
"source": [
"def read_sparql_query_xml(result, *, encoding=None, **kwargs) -> pd.DataFrame:\n",
" if encoding is None:\n",
" content_type = result.info()['content-type']\n",
" encoding = get_encoding(content_type)\n",
" logging.debug('encoding: %r', encoding)\n",
"\n",
" pairs = etree.iterparse(result.response, events=('start', 'end'),\n",
" parser=etree.XMLParser(encoding=encoding))\n",
" _, root = next(pairs)\n",
" logging.info('xml: %r', root)\n",
"\n",
" ns = extract_ns(root.tag)\n",
" if ns != SPARQL_RESULTS:\n",
" raise ValueError(f'error: unknown xml namespace {ns!r} (expected: {SPARQL_RESULTS!r})')\n",
" ns = f'{{{ns}}}'\n",
"\n",
" sparql_root = f'{ns}sparql'\n",
" if root.tag != sparql_root:\n",
" raise ValueError(f'error: invalid xml root tag {root.tag!r} (expected: {sparql_root!r})')\n",
"\n",
" sparql_head = f'{ns}head'\n",
" head = next(elem for event, elem in pairs if event == 'end' and elem.tag == sparql_head)\n",
" variables = [variable.attrib['name'] for variable in head.findall(f'{ns}variable')]\n",
" kwargs.setdefault('columns', variables)\n",
"\n",
" records = iterrecords(root, pairs, variables=variables)\n",
" return pd_dataframe_from_records(records, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "5d3903f1-b6a8-4049-a079-15fe856357ba",
"metadata": {},
"outputs": [],
"source": [
"def iterrecords(root, pairs, *, variables, sparql_result: str = f'{NS}result'):\n",
" sparql_values = [f'{NS}binding[@name=\"{name}\"]/' for name in variables]\n",
" for event, elem in pairs:\n",
" if event != 'end':\n",
" pass\n",
" elif elem.tag == sparql_result: \n",
" value_elements = map(elem.find, sparql_values)\n",
" yield tuple(itervalues(value_elements))\n",
" root.clear()\n",
" elif elem.tag == root.tag:\n",
" assert next(pairs, None) is None\n",
" return"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "32b4e966-a4dd-4d83-8ba3-7f2a93cb844e",
"metadata": {},
"outputs": [],
"source": [
"def itervalues(value_elements, *,\n",
" sparql_literal: str = f'{NS}literal',\n",
" sparql_uri: str = f'{NS}uri',\n",
" sparql_bnode: str = f'{NS}bnode',):\n",
" for value_elem in value_elements:\n",
" if value_elem is None:\n",
" yield None\n",
" continue\n",
"\n",
" if value_elem.tag not in (sparql_literal, sparql_uri, sparql_bnode):\n",
" raise ValueError(f'invalid binding value tag: {value_elem.tag!r}')\n",
" value = value_elem.text\n",
" if value_elem.tag == sparql_literal and 'datatype' in value_elem.attrib:\n",
" datatype = value_elem.attrib['datatype']\n",
" if datatype is not None:\n",
" try:\n",
" parse_value = PARSE_FUNC[datatype]\n",
" except KeyError:\n",
" warnings.warn(f'cannot convert unknown datatype: {datatype!r}')\n",
" else:\n",
" value = parse_value(value)\n",
" yield value"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "0b4f1fab-4f46-4c6b-b107-bffc6388f775",
"metadata": {},
"outputs": [],
"source": [
"XSD = 'http://www.w3.org/2001/XMLSchema#'"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "2d68fcef-ef94-4b3a-9dc5-de49b212cce0",
"metadata": {},
"outputs": [],
"source": [
"PARSE_FUNC = {f'{XSD}string': lambda x: x,\n",
" f'{XSD}integer': int,\n",
" f'{XSD}boolean': lambda x: bool(distutils.util.strtobool(x)),\n",
" f'{XSD}decimal': decimal.Decimal,\n",
" f'{XSD}double': float,\n",
" f'{XSD}float': float,\n",
" f'{XSD}dateTime': datetime.datetime.fromisoformat,\n",
" f'{XSD}date': datetime.date.fromisoformat,\n",
" f'{XSD}time': datetime.time.fromisoformat}"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "29bf61ef-0328-478a-a877-30fbd45eeeca",
"metadata": {},
"outputs": [],
"source": [
"class XmlUrllibStrategy(QueryUrllib):\n",
"\n",
" headers = {'Accept': 'application/sparql-results+xml'}\n",
"\n",
" read_sparql_query_result = staticmethod(read_sparql_query_xml)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "95d5889f-e830-4b68-839d-5558d8fcc959",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] request: 'GET' <urllib.request.Request object at 0x000001E1EEC03C10>\n",
"[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000001E1EEC01F60>\n",
"[INFO@root] content-type: 'application/sparql-results+xml;charset=utf-8'\n",
"[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000001E1EEC3DB70>\n",
"[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000001E1EEBABBC0>, **{'index': 'index', 'columns': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time'], 'coerce_float': True})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 8 entries, 0 to 0\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 string 8 non-null object \n",
" 1 integer 8 non-null int64 \n",
" 2 boolean 8 non-null bool \n",
" 3 decimal 8 non-null float64 \n",
" 4 double 8 non-null float64 \n",
" 5 float 8 non-null float64 \n",
" 6 datetime 8 non-null datetime64[ns]\n",
" 7 date 8 non-null object \n",
" 8 time 8 non-null object \n",
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n",
"memory usage: 1.5 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>string</th>\n",
" <th>integer</th>\n",
" <th>boolean</th>\n",
" <th>decimal</th>\n",
" <th>double</th>\n",
" <th>float</th>\n",
" <th>datetime</th>\n",
" <th>date</th>\n",
" <th>time</th>\n",
" </tr>\n",
" <tr>\n",
" <th>index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" string integer boolean decimal double float \\\n",
"index \n",
"0 'spam' 7 True 1.3 1000000.0 6.275 \n",
"0 'spam' 7 False 1.3 1000000.0 6.275 \n",
"0 'spam' 42 True 1.3 1000000.0 6.275 \n",
"0 'spam' 42 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n",
"\n",
" datetime date \\\n",
"index \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"\n",
" time \n",
"index \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"XmlUrllibStrategy(ENDPOINT).query_pandas(TEST_QUERY, index='index').pipe(pipe_info).applymap(repr)"
]
},
{
"cell_type": "markdown",
"id": "a490428a-aeef-4a07-a886-66c992b031e9",
"metadata": {},
"source": [
"## rdflib"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "6a78e26c-e9d9-408c-8726-35507a76f3af",
"metadata": {},
"outputs": [],
"source": [
"class SCHEMA(rdflib.SDO):\n",
" \"\"\"https://github.com/RDFLib/rdflib/issues/1120\"\"\"\n",
"\n",
" _NS = rdflib.Namespace(rdflib.SDO._NS.replace('https://', 'http://'))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "7919a1c7-e917-4f55-b7e7-fd950226616b",
"metadata": {},
"outputs": [],
"source": [
"def open_sparql_graph(endpoint: str, *, prefixes={'schema': SCHEMA}):\n",
" logging.info('endpoint: %r', endpoint)\n",
" graph = rdflib.ConjunctiveGraph('SPARQLStore')\n",
" graph.open(endpoint)\n",
" logging.info('graph: %s', graph)\n",
"\n",
" logging.info('prefixes: %r', prefixes)\n",
" for prefix, namespace in prefixes.items():\n",
" graph.namespace_manager.bind(prefix, namespace, replace=True)\n",
"\n",
" logging.debug('namespaces: %r', list(graph.namespaces()))\n",
" return graph"
]
},
{
"cell_type": "markdown",
"id": "e2a7cf6d-7d0d-471d-b09c-43be532a2eaf",
"metadata": {},
"source": [
"### `rdflib.ConjunctiveGraph('SPARQLStore')`"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "3ad3eee6-0a03-4a77-9972-858521ded602",
"metadata": {},
"outputs": [],
"source": [
"def read_sparql_query_graph(result, **kwargs) -> pd.DataFrame:\n",
" variables = [v.toPython().removeprefix('?') for v in result.vars]\n",
" kwargs.setdefault('columns', variables)\n",
" records = ([v.toPython() if v is not None else None for v in values]\n",
" for values in result)\n",
" return pd_dataframe_from_records(records, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "771a038a-e21d-4415-b11a-720ce570b804",
"metadata": {},
"outputs": [],
"source": [
"class GraphXmlStrategy(QueryPandas):\n",
" \n",
" def __repr__(self) -> str:\n",
" return f'<{self.__class__.__name__} {self.endpoint!r} graph={self.graph!r}>'\n",
"\n",
" @functools.cached_property\n",
" def graph(self):\n",
" return open_sparql_graph(self.endpoint)\n",
"\n",
" def query(self, query: str):\n",
" result = self.graph.query(query)\n",
" logging.info('result: %r', result)\n",
" return result\n",
"\n",
" read_sparql_query_result = staticmethod(read_sparql_query_graph)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "e711ce12-c947-4525-b8c5-e0ff656ff970",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] graph: [a rdflib:ConjunctiveGraph;rdflib:storage [a rdflib:Store;rdfs:label 'SPARQLStore']]\n",
"[INFO@root] prefixes: {'schema': Namespace(\"http://schema.org/\")}\n",
"[INFO@root] result: <rdflib.plugins.sparql.results.xmlresults.XMLResult object at 0x000001E1EEC99990>\n",
"[INFO@root] pandas.DataFrame.from_records(<generator object read_sparql_query_graph.<locals>.<genexpr> at 0x000001E1EF0D1310>, **{'index': 'index', 'columns': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time'], 'coerce_float': True})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 8 entries, 0 to 0\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 string 8 non-null object \n",
" 1 integer 8 non-null int64 \n",
" 2 boolean 8 non-null bool \n",
" 3 decimal 8 non-null float64 \n",
" 4 double 8 non-null float64 \n",
" 5 float 8 non-null float64 \n",
" 6 datetime 8 non-null datetime64[ns]\n",
" 7 date 8 non-null object \n",
" 8 time 8 non-null object \n",
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n",
"memory usage: 1.5 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>string</th>\n",
" <th>integer</th>\n",
" <th>boolean</th>\n",
" <th>decimal</th>\n",
" <th>double</th>\n",
" <th>float</th>\n",
" <th>datetime</th>\n",
" <th>date</th>\n",
" <th>time</th>\n",
" </tr>\n",
" <tr>\n",
" <th>index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" string integer boolean decimal double float \\\n",
"index \n",
"0 'spam' 7 True 1.3 1000000.0 6.275 \n",
"0 'spam' 7 False 1.3 1000000.0 6.275 \n",
"0 'spam' 42 True 1.3 1000000.0 6.275 \n",
"0 'spam' 42 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n",
"\n",
" datetime date \\\n",
"index \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"\n",
" time \n",
"index \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"GraphXmlStrategy(ENDPOINT).query_pandas(TEST_QUERY, index='index').pipe(pipe_info).applymap(repr)"
]
},
{
"cell_type": "markdown",
"id": "6d30940d-4cde-44ee-a767-a98a538dbb08",
"metadata": {},
"source": [
"## SPARQLWrapper"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "62fed472-860d-4ea5-bf37-0f93eaebffd8",
"metadata": {},
"outputs": [],
"source": [
"class QuerySPARQLWrapper:\n",
"\n",
" def __repr__(self) -> str:\n",
" return f'{self.__class__.__name__}({self.endpoint!r}, returnFormat={self.returnFormat!r})'"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "db4a18f8-d06a-473c-bded-65bfe509eefa",
"metadata": {},
"outputs": [],
"source": [
"class QueryService(QuerySPARQLWrapper, sparqlwrapper.SPARQLWrapper, QueryPandas):\n",
" \"\"\"Query service.\"\"\"\n",
"\n",
" def query(self, sparql: str):\n",
" logging.info('endpoint: %r', self.endpoint)\n",
" logging.info('returnFormat: %r', self.returnFormat)\n",
" if self.returnFormat in (sparqlwrapper.CSV, sparqlwrapper.TSV):\n",
" logging.info('%r.setOnlyConnreg(True)', self)\n",
" self.setOnlyConneg(True)\n",
" elif not self.supportsReturnFormat(self.returnFormat):\n",
" raise ValueError('unsupposted return format: %r', self.returnFormat)\n",
" self.setQuery(sparql)\n",
" result = super().query()\n",
" logging.info('result: %r', result)\n",
" headers = result.info()\n",
" logging.debug('headers: %r', headers)\n",
" logging.info('content_type: %r', headers['content-type'])\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "a3b5dd29-ca50-4088-a568-90bac6d604fc",
"metadata": {},
"outputs": [],
"source": [
"class JsonReturnFormatQueryService(QuerySPARQLWrapper, sparqlwrapper.SPARQLWrapper2, QueryPandas):\n",
" \"\"\"Query service with JSON return format.\"\"\"\n",
" \n",
" def query(self, sparql: str):\n",
" logging.info('endpoint: %r', self.endpoint)\n",
" self.setQuery(sparql)\n",
" result = super().query()\n",
" logging.info('result: %r', result)\n",
" logging.info('result.variables: %r', result.variables)\n",
" return result"
]
},
{
"cell_type": "markdown",
"id": "ca4321ee-3a45-4c36-8a4c-617a13a5140b",
"metadata": {},
"source": [
"### `SPARQLWrapper.SPARQLWrapper(returnFormat='csv')`"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "fb82bc0a-8fea-4478-9eb7-8d7af008dbf0",
"metadata": {},
"outputs": [],
"source": [
"class CsvSwStrategy(QueryService):\n",
"\n",
" def __init__(self, endpoint: str, *, returnFormat=sparqlwrapper.CSV, **kwargs) -> None:\n",
" super().__init__(endpoint, returnFormat=returnFormat, **kwargs)\n",
"\n",
" read_sparql_query_result = staticmethod(read_sparql_query_csv)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "0e59b728-e7dc-426a-8c85-cbd6fdc03c44",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] returnFormat: 'csv'\n",
"[INFO@root] CsvSwStrategy('https://query.wikidata.org/sparql', returnFormat='csv').setOnlyConnreg(True)\n",
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EF0FBB80>\n",
"[INFO@root] content_type: 'text/csv;charset=utf-8'\n",
"[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000001E1EF0FBCD0>, **{'index_col': 'index', 'na_values': '', 'keep_default_na': False, 'encoding': 'utf-8'})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 8 entries, 0 to 0\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 string 8 non-null object \n",
" 1 integer 8 non-null int64 \n",
" 2 boolean 8 non-null bool \n",
" 3 decimal 8 non-null float64 \n",
" 4 double 8 non-null float64 \n",
" 5 float 8 non-null float64 \n",
" 6 datetime 8 non-null datetime64[ns]\n",
" 7 date 8 non-null object \n",
" 8 time 8 non-null object \n",
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n",
"memory usage: 1.9 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>string</th>\n",
" <th>integer</th>\n",
" <th>boolean</th>\n",
" <th>decimal</th>\n",
" <th>double</th>\n",
" <th>float</th>\n",
" <th>datetime</th>\n",
" <th>date</th>\n",
" <th>time</th>\n",
" </tr>\n",
" <tr>\n",
" <th>index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" string integer boolean decimal double float \\\n",
"index \n",
"0 'spam' 7 True 1.3 1000000.0 6.275 \n",
"0 'spam' 7 False 1.3 1000000.0 6.275 \n",
"0 'spam' 42 True 1.3 1000000.0 6.275 \n",
"0 'spam' 42 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n",
"\n",
" datetime date time \n",
"index \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(CsvSwStrategy(ENDPOINT).query_pandas(TEST_QUERY, index_col='index')\n",
" .astype({'datetime': 'datetime64'}).pipe(pipe_info).applymap(repr))"
]
},
{
"cell_type": "markdown",
"id": "947aa733-b6ff-47cf-a862-d0ab23ac147c",
"metadata": {},
"source": [
"### `SPARQLWrapper.SPARQLWrapper(returnFormat='xml')`"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "64ffe095-4c87-4b80-bb93-077c18a39c5c",
"metadata": {},
"outputs": [],
"source": [
"class XmlSwStrategy(QueryService):\n",
"\n",
" read_sparql_query_result = staticmethod(read_sparql_query_xml)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "bcf515da-e4a7-45e2-b4f0-9c38fba6895b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] returnFormat: 'xml'\n",
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EF0FB670>\n",
"[INFO@root] content_type: 'application/sparql-results+xml;charset=utf-8'\n",
"[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000001E1EF12B060>\n",
"[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000001E1EF0D1D20>, **{'index': 'index', 'columns': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time'], 'coerce_float': True})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 8 entries, 0 to 0\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 string 8 non-null object \n",
" 1 integer 8 non-null int64 \n",
" 2 boolean 8 non-null bool \n",
" 3 decimal 8 non-null float64 \n",
" 4 double 8 non-null float64 \n",
" 5 float 8 non-null float64 \n",
" 6 datetime 8 non-null datetime64[ns]\n",
" 7 date 8 non-null object \n",
" 8 time 8 non-null object \n",
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n",
"memory usage: 1.5 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>string</th>\n",
" <th>integer</th>\n",
" <th>boolean</th>\n",
" <th>decimal</th>\n",
" <th>double</th>\n",
" <th>float</th>\n",
" <th>datetime</th>\n",
" <th>date</th>\n",
" <th>time</th>\n",
" </tr>\n",
" <tr>\n",
" <th>index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" string integer boolean decimal double float \\\n",
"index \n",
"0 'spam' 7 True 1.3 1000000.0 6.275 \n",
"0 'spam' 7 False 1.3 1000000.0 6.275 \n",
"0 'spam' 42 True 1.3 1000000.0 6.275 \n",
"0 'spam' 42 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n",
"\n",
" datetime date \\\n",
"index \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"\n",
" time \n",
"index \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"XmlSwStrategy(ENDPOINT).query_pandas(TEST_QUERY, index='index').pipe(pipe_info).applymap(repr)"
]
},
{
"cell_type": "markdown",
"id": "b167836d-de6c-4f99-a840-ff05cb41d623",
"metadata": {},
"source": [
"### `SPARQLWrapper.SPARQLWrapper2()`"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "69ea97b4-8814-45c6-ad0f-aeb27896a59a",
"metadata": {},
"outputs": [],
"source": [
"def read_sparql_query_json_result(result, **kwargs) -> pd.DataFrame:\n",
" kwargs.setdefault('columns', result.variables)\n",
" records = (tuple(itervalues_json(binding, variables=result.variables))\n",
" for binding in result.bindings)\n",
" return pd_dataframe_from_records(records, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "688cfb61-97ba-4aa4-b794-146c00ff5b2c",
"metadata": {},
"outputs": [],
"source": [
"def itervalues_json(binding, *, variables):\n",
" for v in variables:\n",
" if v not in binding:\n",
" yield None\n",
" continue\n",
"\n",
" value_dict = binding[v]\n",
" if value_dict.type not in ('literal', 'uri', 'bnode'):\n",
" raise ValueError(f'invalid binding value type {b.type!r}')\n",
" value = value_dict.value\n",
" if value_dict.type == 'literal' and value_dict.datatype is not None:\n",
" try:\n",
" parse_value = PARSE_FUNC[value_dict.datatype]\n",
" except KeyError:\n",
" warnings.warn('cannot convert unmatched datatype: {value_dict!r}')\n",
" else:\n",
" value = parse_value(value)\n",
" yield value"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "fd10ccbd-65b2-4dcc-8fc5-2ac154ce3833",
"metadata": {},
"outputs": [],
"source": [
"class JsonResultStrategy(JsonReturnFormatQueryService):\n",
"\n",
" read_sparql_query_result = staticmethod(read_sparql_query_json_result)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "b22dbc5a-cc0e-405b-ae2e-0ab3cf2e7fc6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] result: <SPARQLWrapper.SmartWrapper.Bindings object at 0x000001E1EF0FB9D0>\n",
"[INFO@root] result.variables: ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time']\n",
"[INFO@root] pandas.DataFrame.from_records(<generator object read_sparql_query_json_result.<locals>.<genexpr> at 0x000001E1EF0D2180>, **{'index': 'index', 'columns': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time'], 'coerce_float': True})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 8 entries, 0 to 0\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 string 8 non-null object \n",
" 1 integer 8 non-null int64 \n",
" 2 boolean 8 non-null bool \n",
" 3 decimal 8 non-null float64 \n",
" 4 double 8 non-null float64 \n",
" 5 float 8 non-null float64 \n",
" 6 datetime 8 non-null datetime64[ns]\n",
" 7 date 8 non-null object \n",
" 8 time 8 non-null object \n",
"dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n",
"memory usage: 1.5 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>string</th>\n",
" <th>integer</th>\n",
" <th>boolean</th>\n",
" <th>decimal</th>\n",
" <th>double</th>\n",
" <th>float</th>\n",
" <th>datetime</th>\n",
" <th>date</th>\n",
" <th>time</th>\n",
" </tr>\n",
" <tr>\n",
" <th>index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>datetime.date(2001, 1, 1)</td>\n",
" <td>datetime.time(18, 30)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" string integer boolean decimal double float \\\n",
"index \n",
"0 'spam' 7 True 1.3 1000000.0 6.275 \n",
"0 'spam' 7 False 1.3 1000000.0 6.275 \n",
"0 'spam' 42 True 1.3 1000000.0 6.275 \n",
"0 'spam' 42 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n",
"\n",
" datetime date \\\n",
"index \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n",
"\n",
" time \n",
"index \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) \n",
"0 datetime.time(18, 30) "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"JsonResultStrategy(ENDPOINT).query_pandas(TEST_QUERY, index='index').pipe(pipe_info).applymap(repr)"
]
},
{
"cell_type": "markdown",
"id": "d020a3fc-248c-4537-a0ad-17a387272863",
"metadata": {},
"source": [
"### `SPARQLWrapper.SPARQLWrapper(returnFormat='json')`"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "650f5ce1-b0c4-497d-9665-42409061e0b9",
"metadata": {},
"outputs": [],
"source": [
"def read_sparql_query_json_normalize(result, *, raw: bool = False, **kwargs) -> pd.DataFrame:\n",
" jsondata = result.convert()\n",
" logging.info(\"result['head']: %r\", jsondata['head'])\n",
" return _read_sparql_query_json_normalize(jsondata, raw=raw, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "25aaec36-4cbc-4c7c-8636-413079ecb93c",
"metadata": {},
"outputs": [],
"source": [
"PARSE_DTYPE = {f'{XSD}string': 'string',\n",
" f'{XSD}integer': 'int',\n",
" f'{XSD}boolean': 'bool',\n",
" f'{XSD}decimal': 'float',\n",
" f'{XSD}double': 'float',\n",
" f'{XSD}float': 'float',\n",
" f'{XSD}dateTime': 'datetime64',\n",
" f'{XSD}date': 'string',\n",
" f'{XSD}time': 'string'}"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "aa31d885-1a16-4964-9d33-560a3047bde6",
"metadata": {},
"outputs": [],
"source": [
"def _read_sparql_query_json_normalize(jsondata, *, raw: bool, **kwargs) -> pd.DataFrame:\n",
" df = pd_json_normalize(jsondata, **kwargs)\n",
" if df.empty:\n",
" return df\n",
" if not raw:\n",
" columns = jsondata['head']['vars']\n",
" rename = {f'{c}.value': c for c in columns}\n",
" dtype = {name: PARSE_DTYPE[binding['datatype']]\n",
" if binding['type'] == 'literal' and 'datatype' in binding\n",
" else 'string'\n",
" for name, binding in jsondata['results']['bindings'][0].items()}\n",
" booleans = [name for name, d in dtype.items() if d in ('bool', 'boolean')]\n",
" df = df[list(rename)].rename(rename, axis='columns')\n",
" df[booleans] = (df[booleans]\n",
" .replace(r'^[Tt]rue$', True, regex=True)\n",
" .replace(r'^[Ff]alse$', False, regex=True))\n",
" return df.astype(dtype)\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "b919c3be-569c-4b15-8256-c31598fded50",
"metadata": {},
"outputs": [],
"source": [
"class JsonNormalizeStrategy(QueryService):\n",
"\n",
" def __init__(self, endpoint, *, returnFormat=sparqlwrapper.JSON, **kwargs) -> None:\n",
" super().__init__(endpoint, returnFormat=returnFormat, **kwargs)\n",
"\n",
" read_sparql_query_result = staticmethod(read_sparql_query_json_normalize)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "7069bf4b-c35e-4e13-a901-25a5f90cd7dc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] returnFormat: 'json'\n",
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EEB99CF0>\n",
"[INFO@root] content_type: 'application/sparql-results+json;charset=utf-8'\n",
"[INFO@root] result['head']: {'vars': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time']}\n",
"[INFO@root] pandas.json_normalize(jsondata, **{'record_path': ['results', 'bindings']})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 8 entries, 0 to 0\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 string 8 non-null string \n",
" 1 integer 8 non-null int32 \n",
" 2 boolean 8 non-null bool \n",
" 3 decimal 8 non-null float64 \n",
" 4 double 8 non-null float64 \n",
" 5 float 8 non-null float64 \n",
" 6 datetime 8 non-null datetime64[ns]\n",
" 7 date 8 non-null string \n",
" 8 time 8 non-null string \n",
"dtypes: bool(1), datetime64[ns](1), float64(3), int32(1), string(3)\n",
"memory usage: 1.8 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>string</th>\n",
" <th>integer</th>\n",
" <th>boolean</th>\n",
" <th>decimal</th>\n",
" <th>double</th>\n",
" <th>float</th>\n",
" <th>datetime</th>\n",
" <th>date</th>\n",
" <th>time</th>\n",
" </tr>\n",
" <tr>\n",
" <th>index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'spam'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'eggs'</td>\n",
" <td>42</td>\n",
" <td>False</td>\n",
" <td>1.3</td>\n",
" <td>1000000.0</td>\n",
" <td>6.275</td>\n",
" <td>Timestamp('2005-04-04 04:04:04')</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" string integer boolean decimal double float \\\n",
"index \n",
"0 'spam' 7 True 1.3 1000000.0 6.275 \n",
"0 'spam' 7 False 1.3 1000000.0 6.275 \n",
"0 'spam' 42 True 1.3 1000000.0 6.275 \n",
"0 'spam' 42 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 7 False 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 True 1.3 1000000.0 6.275 \n",
"0 'eggs' 42 False 1.3 1000000.0 6.275 \n",
"\n",
" datetime date time \n",
"index \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n",
"0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' "
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"JsonNormalizeStrategy(ENDPOINT).query_pandas(TEST_QUERY).set_index('index').pipe(pipe_info).applymap(repr)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "ef657e62-7f89-413a-91a8-5bba2b3fefb8",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] returnFormat: 'json'\n",
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EEB998D0>\n",
"[INFO@root] content_type: 'application/sparql-results+json;charset=utf-8'\n",
"[INFO@root] result['head']: {'vars': ['index', 'string', 'integer', 'boolean', 'decimal', 'double', 'float', 'datetime', 'date', 'time']}\n",
"[INFO@root] pandas.json_normalize(jsondata, **{'sep': '/', 'record_path': ['results', 'bindings']})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8 entries, 0 to 0\n",
"Data columns (total 28 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 index/datatype 8 non-null object\n",
" 1 index/type 8 non-null object\n",
" 2 string/type 8 non-null object\n",
" 3 string/value 8 non-null object\n",
" 4 integer/datatype 8 non-null object\n",
" 5 integer/type 8 non-null object\n",
" 6 integer/value 8 non-null object\n",
" 7 boolean/datatype 8 non-null object\n",
" 8 boolean/type 8 non-null object\n",
" 9 boolean/value 8 non-null object\n",
" 10 decimal/datatype 8 non-null object\n",
" 11 decimal/type 8 non-null object\n",
" 12 decimal/value 8 non-null object\n",
" 13 double/datatype 8 non-null object\n",
" 14 double/type 8 non-null object\n",
" 15 double/value 8 non-null object\n",
" 16 float/datatype 8 non-null object\n",
" 17 float/type 8 non-null object\n",
" 18 float/value 8 non-null object\n",
" 19 datetime/datatype 8 non-null object\n",
" 20 datetime/type 8 non-null object\n",
" 21 datetime/value 8 non-null object\n",
" 22 date/datatype 8 non-null object\n",
" 23 date/type 8 non-null object\n",
" 24 date/value 8 non-null object\n",
" 25 time/datatype 8 non-null object\n",
" 26 time/type 8 non-null object\n",
" 27 time/value 8 non-null object\n",
"dtypes: object(28)\n",
"memory usage: 16.7 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index/datatype</th>\n",
" <th>index/type</th>\n",
" <th>string/type</th>\n",
" <th>string/value</th>\n",
" <th>integer/datatype</th>\n",
" <th>integer/type</th>\n",
" <th>integer/value</th>\n",
" <th>boolean/datatype</th>\n",
" <th>boolean/type</th>\n",
" <th>boolean/value</th>\n",
" <th>...</th>\n",
" <th>float/value</th>\n",
" <th>datetime/datatype</th>\n",
" <th>datetime/type</th>\n",
" <th>datetime/value</th>\n",
" <th>date/datatype</th>\n",
" <th>date/type</th>\n",
" <th>date/value</th>\n",
" <th>time/datatype</th>\n",
" <th>time/type</th>\n",
" <th>time/value</th>\n",
" </tr>\n",
" <tr>\n",
" <th>index/value</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'literal'</td>\n",
" <td>'spam'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'7'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n",
" <td>'literal'</td>\n",
" <td>'true'</td>\n",
" <td>...</td>\n",
" <td>'6.275'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n",
" <td>'literal'</td>\n",
" <td>'2005-04-04T04:04:04'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n",
" <td>'literal'</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n",
" <td>'literal'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'literal'</td>\n",
" <td>'spam'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'7'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n",
" <td>'literal'</td>\n",
" <td>'false'</td>\n",
" <td>...</td>\n",
" <td>'6.275'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n",
" <td>'literal'</td>\n",
" <td>'2005-04-04T04:04:04'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n",
" <td>'literal'</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n",
" <td>'literal'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'literal'</td>\n",
" <td>'spam'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'42'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n",
" <td>'literal'</td>\n",
" <td>'true'</td>\n",
" <td>...</td>\n",
" <td>'6.275'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n",
" <td>'literal'</td>\n",
" <td>'2005-04-04T04:04:04'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n",
" <td>'literal'</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n",
" <td>'literal'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'literal'</td>\n",
" <td>'spam'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'42'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n",
" <td>'literal'</td>\n",
" <td>'false'</td>\n",
" <td>...</td>\n",
" <td>'6.275'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n",
" <td>'literal'</td>\n",
" <td>'2005-04-04T04:04:04'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n",
" <td>'literal'</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n",
" <td>'literal'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'literal'</td>\n",
" <td>'eggs'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'7'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n",
" <td>'literal'</td>\n",
" <td>'true'</td>\n",
" <td>...</td>\n",
" <td>'6.275'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n",
" <td>'literal'</td>\n",
" <td>'2005-04-04T04:04:04'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n",
" <td>'literal'</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n",
" <td>'literal'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'literal'</td>\n",
" <td>'eggs'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'7'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n",
" <td>'literal'</td>\n",
" <td>'false'</td>\n",
" <td>...</td>\n",
" <td>'6.275'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n",
" <td>'literal'</td>\n",
" <td>'2005-04-04T04:04:04'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n",
" <td>'literal'</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n",
" <td>'literal'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'literal'</td>\n",
" <td>'eggs'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'42'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n",
" <td>'literal'</td>\n",
" <td>'true'</td>\n",
" <td>...</td>\n",
" <td>'6.275'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n",
" <td>'literal'</td>\n",
" <td>'2005-04-04T04:04:04'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n",
" <td>'literal'</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n",
" <td>'literal'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'literal'</td>\n",
" <td>'eggs'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#integer'</td>\n",
" <td>'literal'</td>\n",
" <td>'42'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#boolean'</td>\n",
" <td>'literal'</td>\n",
" <td>'false'</td>\n",
" <td>...</td>\n",
" <td>'6.275'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#dateTime'</td>\n",
" <td>'literal'</td>\n",
" <td>'2005-04-04T04:04:04'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#date'</td>\n",
" <td>'literal'</td>\n",
" <td>'2001-01-01'</td>\n",
" <td>'http://www.w3.org/2001/XMLSchema#time'</td>\n",
" <td>'literal'</td>\n",
" <td>'18:30'</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 28 columns</p>\n",
"</div>"
],
"text/plain": [
" index/datatype index/type \\\n",
"index/value \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"\n",
" string/type string/value \\\n",
"index/value \n",
"0 'literal' 'spam' \n",
"0 'literal' 'spam' \n",
"0 'literal' 'spam' \n",
"0 'literal' 'spam' \n",
"0 'literal' 'eggs' \n",
"0 'literal' 'eggs' \n",
"0 'literal' 'eggs' \n",
"0 'literal' 'eggs' \n",
"\n",
" integer/datatype integer/type \\\n",
"index/value \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#integer' 'literal' \n",
"\n",
" integer/value boolean/datatype \\\n",
"index/value \n",
"0 '7' 'http://www.w3.org/2001/XMLSchema#boolean' \n",
"0 '7' 'http://www.w3.org/2001/XMLSchema#boolean' \n",
"0 '42' 'http://www.w3.org/2001/XMLSchema#boolean' \n",
"0 '42' 'http://www.w3.org/2001/XMLSchema#boolean' \n",
"0 '7' 'http://www.w3.org/2001/XMLSchema#boolean' \n",
"0 '7' 'http://www.w3.org/2001/XMLSchema#boolean' \n",
"0 '42' 'http://www.w3.org/2001/XMLSchema#boolean' \n",
"0 '42' 'http://www.w3.org/2001/XMLSchema#boolean' \n",
"\n",
" boolean/type boolean/value ... float/value \\\n",
"index/value ... \n",
"0 'literal' 'true' ... '6.275' \n",
"0 'literal' 'false' ... '6.275' \n",
"0 'literal' 'true' ... '6.275' \n",
"0 'literal' 'false' ... '6.275' \n",
"0 'literal' 'true' ... '6.275' \n",
"0 'literal' 'false' ... '6.275' \n",
"0 'literal' 'true' ... '6.275' \n",
"0 'literal' 'false' ... '6.275' \n",
"\n",
" datetime/datatype datetime/type \\\n",
"index/value \n",
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n",
"0 'http://www.w3.org/2001/XMLSchema#dateTime' 'literal' \n",
"\n",
" datetime/value date/datatype \\\n",
"index/value \n",
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n",
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n",
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n",
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n",
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n",
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n",
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n",
"0 '2005-04-04T04:04:04' 'http://www.w3.org/2001/XMLSchema#date' \n",
"\n",
" date/type date/value time/datatype \\\n",
"index/value \n",
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n",
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n",
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n",
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n",
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n",
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n",
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n",
"0 'literal' '2001-01-01' 'http://www.w3.org/2001/XMLSchema#time' \n",
"\n",
" time/type time/value \n",
"index/value \n",
"0 'literal' '18:30' \n",
"0 'literal' '18:30' \n",
"0 'literal' '18:30' \n",
"0 'literal' '18:30' \n",
"0 'literal' '18:30' \n",
"0 'literal' '18:30' \n",
"0 'literal' '18:30' \n",
"0 'literal' '18:30' \n",
"\n",
"[8 rows x 28 columns]"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"JsonNormalizeStrategy(ENDPOINT).query_pandas(TEST_QUERY, raw=True, sep='/').set_index('index/value').pipe(pipe_info).applymap(repr)"
]
},
{
"cell_type": "markdown",
"id": "23939e50-6d14-4cec-abdd-689dee23bf43",
"metadata": {},
"source": [
"## `read_sparql_query(sparql, *, endpoint, strategy)`"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "9bbca0b3-15e8-4e56-b147-f6316c7ad169",
"metadata": {},
"outputs": [],
"source": [
"class ReadSparqlQuery:\n",
"\n",
" @classmethod\n",
" @functools.lru_cache(maxsize=10)\n",
" def from_cache(cls, endpoint: str, *, strategy: str) -> ReadSparqlQuery:\n",
" inst = cls(endpoint, strategy=strategy)\n",
" logging.info('%r', inst)\n",
" return inst\n",
"\n",
" _strategies = {'csv': CsvUrllibStrategy,\n",
" 'csv_sw': CsvSwStrategy,\n",
" 'json': JsonResultStrategy,\n",
" 'json_normalize': JsonNormalizeStrategy,\n",
" 'xml': XmlUrllibStrategy,\n",
" 'xml_sw': XmlSwStrategy,\n",
" 'xml_rdflib': GraphXmlStrategy}\n",
"\n",
" def __init__(self, endpoint: str, *, strategy: str) -> None:\n",
" self.endpoint = endpoint\n",
" assert strategy in self._strategies\n",
" self._strategy = strategy\n",
"\n",
" def __repr__(self) -> str:\n",
" return f'{self.__class__.__name__}({self.endpoint!r}, strategy={self._strategy!r})'\n",
"\n",
" @functools.cached_property\n",
" def strategy(self):\n",
" strategy_cls = self._strategies[self._strategy]\n",
" strategy = strategy_cls(self.endpoint)\n",
" logging.info('strategy: %r', strategy)\n",
" return strategy\n",
"\n",
" def __call__(self, sparql: str, **kwargs) -> pd.DataFrame:\n",
" return self.strategy.query_pandas(sparql, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "4ecf6e63-5dba-48b1-8494-47977d822e7d",
"metadata": {},
"outputs": [],
"source": [
"def read_sparql_query(sparql: str, *, convert_dtypes: bool = False,\n",
" endpoint: str, strategy: str, **kwargs) -> pd.DataFrame:\n",
" reader = ReadSparqlQuery.from_cache(endpoint, strategy=strategy)\n",
" df = reader(sparql, **kwargs)\n",
" if convert_dtypes:\n",
" return df.convert_dtypes()\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "c40aa7a7-da7e-4a34-b85c-e742dc50baff",
"metadata": {},
"outputs": [],
"source": [
"read_wikidata = functools.partial(read_sparql_query, QUERY, convert_dtypes=True, endpoint=ENDPOINT)"
]
},
{
"cell_type": "markdown",
"id": "5791761f-99c7-4fb0-aa61-848553864874",
"metadata": {},
"source": [
"### `strategy='csv'`"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "aa002249-49fe-49f7-acd9-8c2021c04139",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='csv')\n",
"[INFO@root] strategy: CsvUrllibStrategy('https://query.wikidata.org/sparql')\n",
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] request: 'GET' <urllib.request.Request object at 0x000001E1EF0FB640>\n",
"[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000001E1EF0F9B10>\n",
"[INFO@root] content-type: 'text/csv;charset=utf-8'\n",
"[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000001E1EF0F9B10>, **{'index_col': 'glottocode', 'na_values': '', 'keep_default_na': False, 'encoding': 'utf-8'})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 15.6 ms\n",
"Wall time: 6.85 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>qid</th>\n",
" <th>name</th>\n",
" <th>title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>aant1238</th>\n",
" <td>Q31312216</td>\n",
" <td>Aantantara</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1238</th>\n",
" <td>Q85516014</td>\n",
" <td>Aari-Gayil</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1239</th>\n",
" <td>Q7495</td>\n",
" <td>Aari</td>\n",
" <td>Aari_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1240</th>\n",
" <td>Q4661732</td>\n",
" <td>Aariya</td>\n",
" <td>Aariya_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aasa1238</th>\n",
" <td>Q56620</td>\n",
" <td>Asa</td>\n",
" <td>Asa_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zumb1240</th>\n",
" <td>Q56252</td>\n",
" <td>Zumbun</td>\n",
" <td>Zumbun_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuni1245</th>\n",
" <td>Q10188</td>\n",
" <td>Zuni</td>\n",
" <td>Zuni_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuoj1238</th>\n",
" <td>Q13848149</td>\n",
" <td>Zuojiang Zhuang</td>\n",
" <td>Zuojiang_Zhuang_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuri1238</th>\n",
" <td>Q248682</td>\n",
" <td>Zurich German</td>\n",
" <td>Z%C3%BCrich_German</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zyph1238</th>\n",
" <td>Q57004</td>\n",
" <td>Zyphe</td>\n",
" <td>Zyphe_language</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10873 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" qid name title\n",
"glottocode \n",
"aant1238 Q31312216 Aantantara <NA>\n",
"aari1238 Q85516014 Aari-Gayil <NA>\n",
"aari1239 Q7495 Aari Aari_language\n",
"aari1240 Q4661732 Aariya Aariya_language\n",
"aasa1238 Q56620 Asa Asa_language\n",
"... ... ... ...\n",
"zumb1240 Q56252 Zumbun Zumbun_language\n",
"zuni1245 Q10188 Zuni Zuni_language\n",
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n",
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n",
"zyph1238 Q57004 Zyphe Zyphe_language\n",
"\n",
"[10873 rows x 3 columns]"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time read_wikidata(strategy='csv', index_col='glottocode')"
]
},
{
"cell_type": "markdown",
"id": "6052a9c6-37a5-4331-9577-f0d1b9eaa7eb",
"metadata": {},
"source": [
"### `strategy='xml'`"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "eb73feff-a3cc-47e3-90c8-d1ff6a7ea564",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='xml')\n",
"[INFO@root] strategy: XmlUrllibStrategy('https://query.wikidata.org/sparql')\n",
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] request: 'GET' <urllib.request.Request object at 0x000001E1EF0FB640>\n",
"[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000001E1EEB99810>\n",
"[INFO@root] content-type: 'application/sparql-results+xml;charset=utf-8'\n",
"[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000001E1F119F510>\n",
"[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000001E1F03ABE60>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 484 ms\n",
"Wall time: 18.7 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>qid</th>\n",
" <th>name</th>\n",
" <th>title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>aant1238</th>\n",
" <td>Q31312216</td>\n",
" <td>Aantantara</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1238</th>\n",
" <td>Q85516014</td>\n",
" <td>Aari-Gayil</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1239</th>\n",
" <td>Q7495</td>\n",
" <td>Aari</td>\n",
" <td>Aari_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1240</th>\n",
" <td>Q4661732</td>\n",
" <td>Aariya</td>\n",
" <td>Aariya_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aasa1238</th>\n",
" <td>Q56620</td>\n",
" <td>Asa</td>\n",
" <td>Asa_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zumb1240</th>\n",
" <td>Q56252</td>\n",
" <td>Zumbun</td>\n",
" <td>Zumbun_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuni1245</th>\n",
" <td>Q10188</td>\n",
" <td>Zuni</td>\n",
" <td>Zuni_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuoj1238</th>\n",
" <td>Q13848149</td>\n",
" <td>Zuojiang Zhuang</td>\n",
" <td>Zuojiang_Zhuang_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuri1238</th>\n",
" <td>Q248682</td>\n",
" <td>Zurich German</td>\n",
" <td>Z%C3%BCrich_German</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zyph1238</th>\n",
" <td>Q57004</td>\n",
" <td>Zyphe</td>\n",
" <td>Zyphe_language</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10873 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" qid name title\n",
"glottocode \n",
"aant1238 Q31312216 Aantantara <NA>\n",
"aari1238 Q85516014 Aari-Gayil <NA>\n",
"aari1239 Q7495 Aari Aari_language\n",
"aari1240 Q4661732 Aariya Aariya_language\n",
"aasa1238 Q56620 Asa Asa_language\n",
"... ... ... ...\n",
"zumb1240 Q56252 Zumbun Zumbun_language\n",
"zuni1245 Q10188 Zuni Zuni_language\n",
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n",
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n",
"zyph1238 Q57004 Zyphe Zyphe_language\n",
"\n",
"[10873 rows x 3 columns]"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time read_wikidata(strategy='xml', index='glottocode')"
]
},
{
"cell_type": "markdown",
"id": "426f9ea7-031c-47fb-89e0-04c17d6ed924",
"metadata": {},
"source": [
"### `strategy='xml_rdflib`"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "c7edcb9e-9a58-4166-acad-e7bf0f7884cb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='xml_rdflib')\n",
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] graph: [a rdflib:ConjunctiveGraph;rdflib:storage [a rdflib:Store;rdfs:label 'SPARQLStore']]\n",
"[INFO@root] prefixes: {'schema': Namespace(\"http://schema.org/\")}\n",
"[INFO@root] strategy: <GraphXmlStrategy 'https://query.wikidata.org/sparql' graph=<Graph identifier=Ncb3cf8bf5f4b429fb9d63d2a1a090667 (<class 'rdflib.graph.ConjunctiveGraph'>)>>\n",
"[INFO@root] result: <rdflib.plugins.sparql.results.xmlresults.XMLResult object at 0x000001E1EF0F9C90>\n",
"[INFO@root] pandas.DataFrame.from_records(<generator object read_sparql_query_graph.<locals>.<genexpr> at 0x000001E1F3A80270>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 438 ms\n",
"Wall time: 14 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>qid</th>\n",
" <th>name</th>\n",
" <th>title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>aant1238</th>\n",
" <td>Q31312216</td>\n",
" <td>Aantantara</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1238</th>\n",
" <td>Q85516014</td>\n",
" <td>Aari-Gayil</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1239</th>\n",
" <td>Q7495</td>\n",
" <td>Aari</td>\n",
" <td>Aari_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1240</th>\n",
" <td>Q4661732</td>\n",
" <td>Aariya</td>\n",
" <td>Aariya_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aasa1238</th>\n",
" <td>Q56620</td>\n",
" <td>Asa</td>\n",
" <td>Asa_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zumb1240</th>\n",
" <td>Q56252</td>\n",
" <td>Zumbun</td>\n",
" <td>Zumbun_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuni1245</th>\n",
" <td>Q10188</td>\n",
" <td>Zuni</td>\n",
" <td>Zuni_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuoj1238</th>\n",
" <td>Q13848149</td>\n",
" <td>Zuojiang Zhuang</td>\n",
" <td>Zuojiang_Zhuang_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuri1238</th>\n",
" <td>Q248682</td>\n",
" <td>Zurich German</td>\n",
" <td>Z%C3%BCrich_German</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zyph1238</th>\n",
" <td>Q57004</td>\n",
" <td>Zyphe</td>\n",
" <td>Zyphe_language</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10873 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" qid name title\n",
"glottocode \n",
"aant1238 Q31312216 Aantantara <NA>\n",
"aari1238 Q85516014 Aari-Gayil <NA>\n",
"aari1239 Q7495 Aari Aari_language\n",
"aari1240 Q4661732 Aariya Aariya_language\n",
"aasa1238 Q56620 Asa Asa_language\n",
"... ... ... ...\n",
"zumb1240 Q56252 Zumbun Zumbun_language\n",
"zuni1245 Q10188 Zuni Zuni_language\n",
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n",
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n",
"zyph1238 Q57004 Zyphe Zyphe_language\n",
"\n",
"[10873 rows x 3 columns]"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time read_wikidata(strategy='xml_rdflib', index='glottocode')"
]
},
{
"cell_type": "markdown",
"id": "235dc1b7-0e6c-4703-b219-f720bf1e9979",
"metadata": {},
"source": [
"### `strategy='csv_sw'`"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "071cc12d-1dec-490e-b192-c7ce7c8ca274",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='csv_sw')\n",
"[INFO@root] strategy: CsvSwStrategy('https://query.wikidata.org/sparql', returnFormat='csv')\n",
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] returnFormat: 'csv'\n",
"[INFO@root] CsvSwStrategy('https://query.wikidata.org/sparql', returnFormat='csv').setOnlyConnreg(True)\n",
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EEC008E0>\n",
"[INFO@root] content_type: 'text/csv;charset=utf-8'\n",
"[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000001E1EEC01D50>, **{'index_col': 'glottocode', 'na_values': '', 'keep_default_na': False, 'encoding': 'utf-8'})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 15.6 ms\n",
"Wall time: 7.7 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>qid</th>\n",
" <th>name</th>\n",
" <th>title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>aant1238</th>\n",
" <td>Q31312216</td>\n",
" <td>Aantantara</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1238</th>\n",
" <td>Q85516014</td>\n",
" <td>Aari-Gayil</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1239</th>\n",
" <td>Q7495</td>\n",
" <td>Aari</td>\n",
" <td>Aari_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1240</th>\n",
" <td>Q4661732</td>\n",
" <td>Aariya</td>\n",
" <td>Aariya_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aasa1238</th>\n",
" <td>Q56620</td>\n",
" <td>Asa</td>\n",
" <td>Asa_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zumb1240</th>\n",
" <td>Q56252</td>\n",
" <td>Zumbun</td>\n",
" <td>Zumbun_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuni1245</th>\n",
" <td>Q10188</td>\n",
" <td>Zuni</td>\n",
" <td>Zuni_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuoj1238</th>\n",
" <td>Q13848149</td>\n",
" <td>Zuojiang Zhuang</td>\n",
" <td>Zuojiang_Zhuang_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuri1238</th>\n",
" <td>Q248682</td>\n",
" <td>Zurich German</td>\n",
" <td>Z%C3%BCrich_German</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zyph1238</th>\n",
" <td>Q57004</td>\n",
" <td>Zyphe</td>\n",
" <td>Zyphe_language</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10873 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" qid name title\n",
"glottocode \n",
"aant1238 Q31312216 Aantantara <NA>\n",
"aari1238 Q85516014 Aari-Gayil <NA>\n",
"aari1239 Q7495 Aari Aari_language\n",
"aari1240 Q4661732 Aariya Aariya_language\n",
"aasa1238 Q56620 Asa Asa_language\n",
"... ... ... ...\n",
"zumb1240 Q56252 Zumbun Zumbun_language\n",
"zuni1245 Q10188 Zuni Zuni_language\n",
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n",
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n",
"zyph1238 Q57004 Zyphe Zyphe_language\n",
"\n",
"[10873 rows x 3 columns]"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time read_wikidata(strategy='csv_sw', index_col='glottocode')"
]
},
{
"cell_type": "markdown",
"id": "e5eb79e1-5a40-4d66-9951-056fbf3e440d",
"metadata": {},
"source": [
"### `strategy='xml_sw'`"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "6d36993c-766f-431c-af72-b341afbd4689",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='xml_sw')\n",
"[INFO@root] strategy: XmlSwStrategy('https://query.wikidata.org/sparql', returnFormat='xml')\n",
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] returnFormat: 'xml'\n",
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EEB99E40>\n",
"[INFO@root] content_type: 'application/sparql-results+xml;charset=utf-8'\n",
"[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000001E1F3318540>\n",
"[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000001E1EF2D09E0>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 625 ms\n",
"Wall time: 8.97 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>qid</th>\n",
" <th>name</th>\n",
" <th>title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>aant1238</th>\n",
" <td>Q31312216</td>\n",
" <td>Aantantara</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1238</th>\n",
" <td>Q85516014</td>\n",
" <td>Aari-Gayil</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1239</th>\n",
" <td>Q7495</td>\n",
" <td>Aari</td>\n",
" <td>Aari_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1240</th>\n",
" <td>Q4661732</td>\n",
" <td>Aariya</td>\n",
" <td>Aariya_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aasa1238</th>\n",
" <td>Q56620</td>\n",
" <td>Asa</td>\n",
" <td>Asa_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zumb1240</th>\n",
" <td>Q56252</td>\n",
" <td>Zumbun</td>\n",
" <td>Zumbun_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuni1245</th>\n",
" <td>Q10188</td>\n",
" <td>Zuni</td>\n",
" <td>Zuni_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuoj1238</th>\n",
" <td>Q13848149</td>\n",
" <td>Zuojiang Zhuang</td>\n",
" <td>Zuojiang_Zhuang_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuri1238</th>\n",
" <td>Q248682</td>\n",
" <td>Zurich German</td>\n",
" <td>Z%C3%BCrich_German</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zyph1238</th>\n",
" <td>Q57004</td>\n",
" <td>Zyphe</td>\n",
" <td>Zyphe_language</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10873 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" qid name title\n",
"glottocode \n",
"aant1238 Q31312216 Aantantara <NA>\n",
"aari1238 Q85516014 Aari-Gayil <NA>\n",
"aari1239 Q7495 Aari Aari_language\n",
"aari1240 Q4661732 Aariya Aariya_language\n",
"aasa1238 Q56620 Asa Asa_language\n",
"... ... ... ...\n",
"zumb1240 Q56252 Zumbun Zumbun_language\n",
"zuni1245 Q10188 Zuni Zuni_language\n",
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n",
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n",
"zyph1238 Q57004 Zyphe Zyphe_language\n",
"\n",
"[10873 rows x 3 columns]"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time read_wikidata(strategy='xml_sw', index='glottocode')"
]
},
{
"cell_type": "markdown",
"id": "56b5fcd4-15d4-49f1-830d-3aabf15c80ca",
"metadata": {},
"source": [
"### `strategy='json'`"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "4bcf9fc1-2bd3-44ac-bc7d-771484626866",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='json')\n",
"[INFO@root] strategy: JsonResultStrategy('https://query.wikidata.org/sparql', returnFormat='json')\n",
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] result: <SPARQLWrapper.SmartWrapper.Bindings object at 0x000001E1EEB9B1F0>\n",
"[INFO@root] result.variables: ['glottocode', 'qid', 'name', 'title']\n",
"[INFO@root] pandas.DataFrame.from_records(<generator object read_sparql_query_json_result.<locals>.<genexpr> at 0x000001E1EF26C190>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 125 ms\n",
"Wall time: 5.3 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>qid</th>\n",
" <th>name</th>\n",
" <th>title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>aant1238</th>\n",
" <td>Q31312216</td>\n",
" <td>Aantantara</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1238</th>\n",
" <td>Q85516014</td>\n",
" <td>Aari-Gayil</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1239</th>\n",
" <td>Q7495</td>\n",
" <td>Aari</td>\n",
" <td>Aari_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1240</th>\n",
" <td>Q4661732</td>\n",
" <td>Aariya</td>\n",
" <td>Aariya_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aasa1238</th>\n",
" <td>Q56620</td>\n",
" <td>Asa</td>\n",
" <td>Asa_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zumb1240</th>\n",
" <td>Q56252</td>\n",
" <td>Zumbun</td>\n",
" <td>Zumbun_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuni1245</th>\n",
" <td>Q10188</td>\n",
" <td>Zuni</td>\n",
" <td>Zuni_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuoj1238</th>\n",
" <td>Q13848149</td>\n",
" <td>Zuojiang Zhuang</td>\n",
" <td>Zuojiang_Zhuang_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuri1238</th>\n",
" <td>Q248682</td>\n",
" <td>Zurich German</td>\n",
" <td>Z%C3%BCrich_German</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zyph1238</th>\n",
" <td>Q57004</td>\n",
" <td>Zyphe</td>\n",
" <td>Zyphe_language</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10873 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" qid name title\n",
"glottocode \n",
"aant1238 Q31312216 Aantantara <NA>\n",
"aari1238 Q85516014 Aari-Gayil <NA>\n",
"aari1239 Q7495 Aari Aari_language\n",
"aari1240 Q4661732 Aariya Aariya_language\n",
"aasa1238 Q56620 Asa Asa_language\n",
"... ... ... ...\n",
"zumb1240 Q56252 Zumbun Zumbun_language\n",
"zuni1245 Q10188 Zuni Zuni_language\n",
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n",
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n",
"zyph1238 Q57004 Zyphe Zyphe_language\n",
"\n",
"[10873 rows x 3 columns]"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time read_wikidata(strategy='json', index='glottocode')"
]
},
{
"cell_type": "markdown",
"id": "3aa86137-540b-49c0-bdbd-c5d98d269b24",
"metadata": {},
"source": [
"### `strategy='json_normalize'`"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "5642ca34-b0a3-49a9-a715-b3d2262c05cc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='json_normalize')\n",
"[INFO@root] strategy: JsonNormalizeStrategy('https://query.wikidata.org/sparql', returnFormat='json')\n",
"[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n",
"[INFO@root] returnFormat: 'json'\n",
"[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000001E1EEB9B280>\n",
"[INFO@root] content_type: 'application/sparql-results+json;charset=utf-8'\n",
"[INFO@root] result['head']: {'vars': ['glottocode', 'qid', 'name', 'title']}\n",
"[INFO@root] pandas.json_normalize(jsondata, **{'record_path': ['results', 'bindings']})\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: total: 391 ms\n",
"Wall time: 4.53 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>qid</th>\n",
" <th>name</th>\n",
" <th>title</th>\n",
" </tr>\n",
" <tr>\n",
" <th>glottocode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>aant1238</th>\n",
" <td>Q31312216</td>\n",
" <td>Aantantara</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1238</th>\n",
" <td>Q85516014</td>\n",
" <td>Aari-Gayil</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1239</th>\n",
" <td>Q7495</td>\n",
" <td>Aari</td>\n",
" <td>Aari_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aari1240</th>\n",
" <td>Q4661732</td>\n",
" <td>Aariya</td>\n",
" <td>Aariya_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>aasa1238</th>\n",
" <td>Q56620</td>\n",
" <td>Asa</td>\n",
" <td>Asa_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zumb1240</th>\n",
" <td>Q56252</td>\n",
" <td>Zumbun</td>\n",
" <td>Zumbun_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuni1245</th>\n",
" <td>Q10188</td>\n",
" <td>Zuni</td>\n",
" <td>Zuni_language</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuoj1238</th>\n",
" <td>Q13848149</td>\n",
" <td>Zuojiang Zhuang</td>\n",
" <td>Zuojiang_Zhuang_languages</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zuri1238</th>\n",
" <td>Q248682</td>\n",
" <td>Zurich German</td>\n",
" <td>Z%C3%BCrich_German</td>\n",
" </tr>\n",
" <tr>\n",
" <th>zyph1238</th>\n",
" <td>Q57004</td>\n",
" <td>Zyphe</td>\n",
" <td>Zyphe_language</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10873 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" qid name title\n",
"glottocode \n",
"aant1238 Q31312216 Aantantara <NA>\n",
"aari1238 Q85516014 Aari-Gayil <NA>\n",
"aari1239 Q7495 Aari Aari_language\n",
"aari1240 Q4661732 Aariya Aariya_language\n",
"aasa1238 Q56620 Asa Asa_language\n",
"... ... ... ...\n",
"zumb1240 Q56252 Zumbun Zumbun_language\n",
"zuni1245 Q10188 Zuni Zuni_language\n",
"zuoj1238 Q13848149 Zuojiang Zhuang Zuojiang_Zhuang_languages\n",
"zuri1238 Q248682 Zurich German Z%C3%BCrich_German\n",
"zyph1238 Q57004 Zyphe Zyphe_language\n",
"\n",
"[10873 rows x 3 columns]"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time read_wikidata(strategy='json_normalize').set_index('glottocode')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment