Skip to content

Instantly share code, notes, and snippets.

@jmkeil
Last active April 5, 2022 12:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jmkeil/3ef8516183cfd15232d0c84547da369e to your computer and use it in GitHub Desktop.
Save jmkeil/3ef8516183cfd15232d0c84547da369e to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import gzip\n",
"import numpy as np\n",
"import pandas as pd\n",
"from tqdm.notebook import tqdm\n",
"from IPython.display import display, HTML"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"numericDatatypes = ['http://www.w3.org/2001/XMLSchema#decimal',\n",
" 'http://www.w3.org/2001/XMLSchema#integer',\n",
" 'http://www.w3.org/2001/XMLSchema#double',\n",
" 'http://www.w3.org/2001/XMLSchema#float',\n",
" 'http://www.w3.org/2001/XMLSchema#byte',\n",
" 'http://www.w3.org/2001/XMLSchema#short',\n",
" 'http://www.w3.org/2001/XMLSchema#int',\n",
" 'http://www.w3.org/2001/XMLSchema#long',\n",
" 'http://www.w3.org/2001/XMLSchema#unsignedByte',\n",
" 'http://www.w3.org/2001/XMLSchema#unsignedShort',\n",
" 'http://www.w3.org/2001/XMLSchema#unsignedInt',\n",
" 'http://www.w3.org/2001/XMLSchema#unsignedLong',\n",
" 'http://www.w3.org/2001/XMLSchema#positiveInteger',\n",
" 'http://www.w3.org/2001/XMLSchema#nonNegativeInteger',\n",
" 'http://www.w3.org/2001/XMLSchema#negativeInteger',\n",
" 'http://www.w3.org/2001/XMLSchema#nonPositiveInteger',\n",
" 'http://www.w3.org/2002/07/owl#rational',\n",
" 'http://www.w3.org/2002/07/owl#real',\n",
" 'http://schema.org/Number',\n",
" 'http://schema.org/Integer',\n",
" 'http://schema.org/Float']\n",
"temporalDatatypes = ['http://www.w3.org/2001/XMLSchema#date',\n",
" 'http://www.w3.org/2001/XMLSchema#time',\n",
" 'http://www.w3.org/2001/XMLSchema#dateTime',\n",
" 'http://www.w3.org/2001/XMLSchema#dateTimeStamp',\n",
" 'http://www.w3.org/2001/XMLSchema#gYear',\n",
" 'http://www.w3.org/2001/XMLSchema#gMonth',\n",
" 'http://www.w3.org/2001/XMLSchema#gDay',\n",
" 'http://www.w3.org/2001/XMLSchema#gYearMonth',\n",
" 'http://www.w3.org/2001/XMLSchema#gMonthDay',\n",
" 'http://www.w3.org/2001/XMLSchema#duration',\n",
" 'http://www.w3.org/2001/XMLSchema#yearMonthDuration',\n",
" 'http://www.w3.org/2001/XMLSchema#dayTimeDuration',\n",
" 'http://schema.org/DateTime',\n",
" 'http://schema.org/Time',\n",
" 'http://schema.org/Date']"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fb830868a9054f57838eb55fa1efd463",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/53641457 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ea6870deea794a03a8575fd0d6d4cb54",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/20361829 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def loadData(file,rowCount):\n",
" data = None\n",
" progress = tqdm(total=rowCount)\n",
" for chunk in (pd.read_csv(file, usecols=['CATEGORY','MEASUREMENT','PROPERTY','DATATYPE','QUANTITY'], chunksize=100000)):\n",
" # aggregate and transform data\n",
" chunkPivot = pd.pivot_table(chunk, values='QUANTITY', index=['CATEGORY','DATATYPE','PROPERTY'], columns='MEASUREMENT', aggfunc=np.sum, fill_value=0)\n",
" # ensure the presence of all columns to avoid NaN values later on\n",
" for column in ['UnpreciseRepresentableInDouble',\n",
" 'UnpreciseRepresentableInFloat',\n",
" 'UsedAsDatatype',\n",
" 'UsedAsPropertyRange',\n",
" 'ValidDateNotation',\n",
" 'ValidDateTimeNotation',\n",
" 'ValidDecimalNotation',\n",
" 'ValidExponentialNotation',\n",
" 'ValidInfOrNaNNotation',\n",
" 'ValidIntegerNotation',\n",
" 'ValidTimeNotation',\n",
" 'ValidTrueOrFalseNotation',\n",
" 'ValidZeroOrOneNotation']:\n",
" if column not in chunkPivot.columns:\n",
" chunkPivot[column]=0\n",
" if data is None:\n",
" data = chunkPivot\n",
" else:\n",
" data = data.add(chunkPivot,fill_value=0)\n",
" # update progress bar\n",
" progress.update(chunk.shape[0])\n",
" data = data.sort_values(by='UsedAsDatatype', ascending=False)\n",
" data = data.astype({'UnpreciseRepresentableInDouble': 'int64',\n",
" 'UnpreciseRepresentableInFloat': 'int64',\n",
" 'UsedAsDatatype': 'int64',\n",
" 'UsedAsPropertyRange': 'int64',\n",
" 'ValidDateNotation': 'int64',\n",
" 'ValidDateTimeNotation': 'int64',\n",
" 'ValidDecimalNotation': 'int64',\n",
" 'ValidExponentialNotation': 'int64',\n",
" 'ValidInfOrNaNNotation': 'int64',\n",
" 'ValidIntegerNotation': 'int64',\n",
" 'ValidTimeNotation': 'int64',\n",
" 'ValidTrueOrFalseNotation': 'int64',\n",
" 'ValidZeroOrOneNotation': 'int64'},copy=False)\n",
" return data\n",
"\n",
"data2020 = loadData('December2020/measurements.csv.gz',53641457) # https://doi.org/10.5281/zenodo.6205111\n",
"data2021 = loadData('October2021/measurements.csv.gz',20361829) # https://doi.org/10.5281/zenodo.6337661"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
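"# cleanup: normalize datatype IRIs (unify http/https, expand known prefixes) and re-aggregate\n",
"# NOTE(review): the replace patterns below are regular expressions ('^' anchors the prefix);\n",
"# pandas >= 2.0 defaults Series.str.replace to regex=False, so regex=True must be passed explicitly there.\n",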
"def cleanup(data):\n",
" data = data.copy()\n",
" data.reset_index(inplace=True)\n",
" ## unify http and https\n",
" data['DATATYPE']=data['DATATYPE'].str.replace('^https:', 'http:', case=False)\n",
" ## fix missing namespace definitions\n",
" data['DATATYPE']=data['DATATYPE'].str.replace('^rdf:', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', case=True)\n",
" data['DATATYPE']=data['DATATYPE'].str.replace('^xsd:', 'http://www.w3.org/2001/XMLSchema#', case=True)\n",
" data['DATATYPE']=data['DATATYPE'].str.replace('^schema:', 'http://schema.org/', case=True)\n",
" data['DATATYPE']=data['DATATYPE'].str.replace('^dcterms:', 'http://purl.org/dc/terms/', case=True)\n",
" data['DATATYPE']=data['DATATYPE'].str.replace('^use:', 'http://search.yahoo.com/searchmonkey-datatype/use/', case=True)\n",
" data['DATATYPE']=data['DATATYPE'].str.replace('^overheid:', 'http://standaarden.overheid.nl/owms/terms/', case=True)\n",
" data = data.groupby(['CATEGORY','DATATYPE','PROPERTY']).sum()\n",
" return data\n",
" \n",
"dataCleaned2020 = cleanup(data2020)\n",
"dataCleaned2021 = cleanup(data2021)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# number of literals per category\n",
"usageByCategory2020 = dataCleaned2020.groupby(['CATEGORY']).sum().filter(items=['UsedAsDatatype'])\n",
"usageByCategory2021 = dataCleaned2021.groupby(['CATEGORY']).sum().filter(items=['UsedAsDatatype'])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# datatypes per source category by occurrence\n",
"def datatypeOccurrences(data,usageByCategory):\n",
" usageByCategoryDatatype = data.loc[data['UsedAsDatatype'] > 0].groupby(['CATEGORY','DATATYPE']).sum().filter(items=['UsedAsDatatype'])\n",
" display(HTML(\n",
" usageByCategoryDatatype.join(usageByCategory.rename(columns = {'UsedAsDatatype':'tmp_UsedAsDatatype'}))\\\n",
" .assign(UsedAsDatatypeRatio=lambda x: round(x['UsedAsDatatype']/x['tmp_UsedAsDatatype'],2))\\\n",
" .drop(columns='tmp_UsedAsDatatype')\\\n",
" .sort_values(['CATEGORY','UsedAsDatatype'], ascending=False)\\\n",
" .to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# lexical representation of temporal datatypes per source category\n",
"def lexicalRepresentationOfTemporals(data,usageByCategory):\n",
" usageByCategoryDatatype = data.loc[data['UsedAsDatatype'] > 0].loc(axis=0)[:,temporalDatatypes].groupby(['CATEGORY','DATATYPE']).sum()\n",
" display(HTML(\n",
" usageByCategoryDatatype.join(usageByCategory.rename(columns = {'UsedAsDatatype':'tmp_UsedAsDatatype'}))\\\n",
" .assign(UsedAsDatatypeRatio=lambda x: round(x['UsedAsDatatype']/x['tmp_UsedAsDatatype'],2))\\\n",
" .drop(columns='tmp_UsedAsDatatype')\\\n",
" .assign(ValidDateNotationRatio=lambda x: round(x['ValidDateNotation']/x['UsedAsDatatype'],2))\\\n",
" .assign(ValidDateTimeNotationRatio=lambda x: round(x['ValidDateTimeNotation']/x['UsedAsDatatype'],2))\\\n",
" .assign(ValidTimeNotationRatio=lambda x: round(x['ValidTimeNotation']/x['UsedAsDatatype'],2))\\\n",
" .filter(items=['UsedAsDatatype','UsedAsDatatypeRatio',\n",
" 'ValidDateNotation', 'ValidDateNotationRatio',\n",
" 'ValidDateTimeNotation', 'ValidDateTimeNotationRatio',\n",
" 'ValidTimeNotation', 'ValidTimeNotationRatio'])\\\n",
" .sort_values(['CATEGORY','UsedAsDatatype'], ascending=False)\\\n",
" .to_html()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Usage of Datatypes in Web Data Commons (December 2020)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MEASUREMENT</th>\n",
" <th>UsedAsDatatype</th>\n",
" <th>UsedAsDatatypeRatio</th>\n",
" </tr>\n",
" <tr>\n",
" <th>CATEGORY</th>\n",
" <th>DATATYPE</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"99\" valign=\"top\">html-rdfa</th>\n",
" <th>http://www.w3.org/1999/02/22-rdf-syntax-ns#langString</th>\n",
" <td>3179161585</td>\n",
" <td>0.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#string</th>\n",
" <td>1305371136</td>\n",
" <td>0.28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>102987223</td>\n",
" <td>0.02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral</th>\n",
" <td>62337177</td>\n",
" <td>0.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#integer</th>\n",
" <td>21547053</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#float</th>\n",
" <td>1025753</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://search.yahoo.com/searchmonkey-datatype/use/sku</th>\n",
" <td>729858</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>507454</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#boolean</th>\n",
" <td>348334</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/Date</th>\n",
" <td>246995</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>197426</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#anyURI</th>\n",
" <td>30629</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>26911</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/Period</th>\n",
" <td>25418</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/W3CDTF</th>\n",
" <td>24782</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/RFC4646</th>\n",
" <td>24772</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/URI</th>\n",
" <td>24750</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#language</th>\n",
" <td>22725</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/ISO3166</th>\n",
" <td>22306</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#int</th>\n",
" <td>19427</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://standaarden.overheid.nl/owms/terms/informatietype</th>\n",
" <td>12911</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#datetime</th>\n",
" <td>9461</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#decimal</th>\n",
" <td>8288</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://id.loc.gov/vocabulary/iso639-2/eng</th>\n",
" <td>3669</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/1999/02/22-rdf-syntax-ns#HTML</th>\n",
" <td>2809</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#datedate</th>\n",
" <td>2238</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://search.yahoo.com/searchmonkey-datatype/use/upc</th>\n",
" <td>1286</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#floatfloat</th>\n",
" <td>1209</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#interval</th>\n",
" <td>1105</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>xmlns:fb=</th>\n",
" <td>1061</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://search.yahoo.com/searchmonkey-datatype/use/isbn</th>\n",
" <td>1048</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://search.yahoo.com/searchmonkey-datatype/use/MPN</th>\n",
" <td>867</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>835</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://standaarden.overheid.nl/owms/terms/Doelgroep</th>\n",
" <td>720</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#normalizedString</th>\n",
" <td>718</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://w3.org/2001/XMLSchema#float</th>\n",
" <td>684</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>XSD:DATETIME</th>\n",
" <td>650</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#nonNegativeInteger</th>\n",
" <td>586</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/date</th>\n",
" <td>448</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchemadateTime</th>\n",
" <td>409</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://w3.org/2001/XMLSchema#integer</th>\n",
" <td>388</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#double</th>\n",
" <td>234</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/text</th>\n",
" <td>191</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://ns.ontowiki.net/SysOnt/Markdown</th>\n",
" <td>148</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dct:ISO3166</th>\n",
" <td>90</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#timetime</th>\n",
" <td>90</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/1999/xhtml</th>\n",
" <td>83</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#Locality</th>\n",
" <td>80</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#Pcode</th>\n",
" <td>80</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#Region</th>\n",
" <td>80</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#Street</th>\n",
" <td>80</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://zeenews.india.com/</th>\n",
" <td>70</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/IMT</th>\n",
" <td>64</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dc:ISO3166</th>\n",
" <td>59</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYearMonth</th>\n",
" <td>58</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/LCSH</th>\n",
" <td>53</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#Name</th>\n",
" <td>50</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-mhtt-rdfw3.w.g/1999/02/22-rdf-syntax-ns#XMLLiteral</th>\n",
" <td>45</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>/sch:mawww.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral</th>\n",
" <td>39</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tags:scwww.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral</th>\n",
" <td>37</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/DCMIType</th>\n",
" <td>24</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>units:mm</th>\n",
" <td>21</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/LCC</th>\n",
" <td>15</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.spacious.hk/zh-tw/</th>\n",
" <td>14</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#hexBinary</th>\n",
" <td>13</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http:///wwww3.org/2001/XMLSchema#dateTime</th>\n",
" <td>10</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ddc:Notation</th>\n",
" <td>9</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http:///wwww3.org/1999/02/22-rdf-syntax-ns#XMLLiteral</th>\n",
" <td>9</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DCTERMS:URI</th>\n",
" <td>8</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/RFC1766</th>\n",
" <td>8</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://standaarden.overheid.nl/owms/terms/bekendmaking</th>\n",
" <td>8</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/ISO3166ISO3166ISO3166</th>\n",
" <td>7</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/DateTime</th>\n",
" <td>7</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://labelleestlabete.fr</th>\n",
" <td>6</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/w3cdtf</th>\n",
" <td>6</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/1999/XMLSchemadate</th>\n",
" <td>6</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#positiveInteger</th>\n",
" <td>6</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-mhtt-rdfw3.w.g/2001/XMLSchema#dateTime</th>\n",
" <td>6</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/Text</th>\n",
" <td>5</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.theincontinencestore.com/product/Premium</th>\n",
" <td>5</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>/sch:mawww.w3.org/2001/XMLSchema#integer</th>\n",
" <td>4</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#dataType</th>\n",
" <td>4</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tags:scwww.w3.org/2001/XMLSchema#integer</th>\n",
" <td>4</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://id.loc.gov/datatypes/orgs/code</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://id.loc.gov/datatypes/orgs/iso15511</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://id.loc.gov/datatypes/orgs/normalized</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/dcmitype</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/rfc4646</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/uri</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.opengis.net/ont/geosparql#wktLiteral</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/1999/02/22-rdf-syntax-ns#HTL</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-mhtt-rdfw3.w.g/2001/XMLSchema#integer</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>xs:date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://parentsecurityonline.com/</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#d%3C/span</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#dataTime</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateT%3C/span</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#integerinteger</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>xs:anyURI</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"54\" valign=\"top\">html-embedded-jsonld</th>\n",
" <th>http://www.w3.org/2001/XMLSchema#string</th>\n",
" <td>11277500571</td>\n",
" <td>0.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#integer</th>\n",
" <td>2021243795</td>\n",
" <td>0.14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/Date</th>\n",
" <td>1313408439</td>\n",
" <td>0.09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#double</th>\n",
" <td>101959406</td>\n",
" <td>0.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#boolean</th>\n",
" <td>26144338</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/DateTime</th>\n",
" <td>25002464</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/1999/02/22-rdf-syntax-ns#langString</th>\n",
" <td>12934431</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#float</th>\n",
" <td>90895</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>12260</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/1999/02/22-rdf-syntax-ns#HTML</th>\n",
" <td>5785</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>3398</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://gs1.org/voc/AI</th>\n",
" <td>2601</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tags:scwww.w3.org/2001/XMLSchema#integer</th>\n",
" <td>2533</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-mhtt-rdfw3.w.g/2001/XMLSchema#integer</th>\n",
" <td>2221</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>/sch:mawww.w3.org/2001/XMLSchema#integer</th>\n",
" <td>2199</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/Distance</th>\n",
" <td>1660</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tags:scschema.org/Date</th>\n",
" <td>1606</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>/sch:maschema.org/Date</th>\n",
" <td>1584</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>1428</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-mhttschema.w.g/Date</th>\n",
" <td>1337</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http:///wwww3.org/2001/XMLSchema#integer</th>\n",
" <td>1030</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#anyURI</th>\n",
" <td>273</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>/sch:mawww.w3.org/2001/XMLSchema#double</th>\n",
" <td>214</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tags:scwww.w3.org/2001/XMLSchema#double</th>\n",
" <td>170</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-mhtt-rdfw3.w.g/2001/XMLSchema#double</th>\n",
" <td>127</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-m:tt-rdfw3.w.g/2001/XMLSchema#integer</th>\n",
" <td>95</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>/sch:mawww.w3.org/2001/XMLSchema#boolean</th>\n",
" <td>64</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#String</th>\n",
" <td>64</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http:///wwww3.org/2001/XMLSchema#double</th>\n",
" <td>50</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-m:ttschema.w.g/Date</th>\n",
" <td>40</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#long</th>\n",
" <td>36</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>34</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tags:scwww.w3.org/2001/XMLSchema#boolean</th>\n",
" <td>34</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-mhtt-rdfw3.w.g/2001/XMLSchema#boolean</th>\n",
" <td>28</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-mhttschema.w.g/Datl</th>\n",
" <td>26</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYearMonth</th>\n",
" <td>20</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tags:scschema.org/DateTime</th>\n",
" <td>20</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#language</th>\n",
" <td>16</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#intcger</th>\n",
" <td>13</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http:///wwww3.org/2001/XMLSchema#boolean</th>\n",
" <td>12</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20191212074322/http://schema.org/Date</th>\n",
" <td>10</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.jobmixer.com/jobmeta.xsd#rgb</th>\n",
" <td>9</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.iana.org/assignments/media-types/application/vnd.geo+json</th>\n",
" <td>8</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.opengis.net/ont/geosparql#wktLiteral</th>\n",
" <td>8</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-mhttschema.w.g/DateTime</th>\n",
" <td>6</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200108234821/http://schema.org/Date</th>\n",
" <td>4</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#Number</th>\n",
" <td>4</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20190422212435/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>/sch:maschema.org/DateTime</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/Rating</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.lightsspectacular.com/2019/XMLSchema#dateTime</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#decimal</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>us-m:tt-rdfw3.w.g/2001/XMLSchema#boolean</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"datatypeOccurrences(dataCleaned2020,usageByCategory2020)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Usage of Datatypes in Web Data Commons (October 2021)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MEASUREMENT</th>\n",
" <th>UsedAsDatatype</th>\n",
" <th>UsedAsDatatypeRatio</th>\n",
" </tr>\n",
" <tr>\n",
" <th>CATEGORY</th>\n",
" <th>DATATYPE</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"41\" valign=\"top\">html-rdfa</th>\n",
" <th>http://www.w3.org/2001/XMLSchema#string</th>\n",
" <td>388251350</td>\n",
" <td>0.94</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>19355530</td>\n",
" <td>0.05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#integer</th>\n",
" <td>3875545</td>\n",
" <td>0.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#float</th>\n",
" <td>261955</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://search.yahoo.com/searchmonkey-datatype/use/sku</th>\n",
" <td>189303</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#boolean</th>\n",
" <td>105241</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>32406</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>21835</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/ISO3166</th>\n",
" <td>6090</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#datetime</th>\n",
" <td>2546</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#int</th>\n",
" <td>2528</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>2434</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#anyURI</th>\n",
" <td>2161</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#datedate</th>\n",
" <td>1462</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#decimal</th>\n",
" <td>1165</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#floatfloat</th>\n",
" <td>789</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#normalizedString</th>\n",
" <td>410</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://search.yahoo.com/searchmonkey-datatype/use/upc</th>\n",
" <td>346</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral</th>\n",
" <td>188</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/1999/xhtml</th>\n",
" <td>124</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchemadate</th>\n",
" <td>117</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://search.yahoo.com/searchmonkey-datatype/use/MPN</th>\n",
" <td>64</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#timetime</th>\n",
" <td>58</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchemadateTime</th>\n",
" <td>39</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>37</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#Name</th>\n",
" <td>28</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#language</th>\n",
" <td>7</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://ns.ontowiki.net/SysOnt/Markdown</th>\n",
" <td>6</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org.leonakinggallery.com/2001/XMLSchema#dateTime</th>\n",
" <td>4</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://search.yahoo.com/searchmonkey-datatype/use/ean</th>\n",
" <td>3</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org.kfoxradio.com/2001/XMLSchema#dateTime</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org.lechatdesucre.com/2001/XMLSchema#dateTime</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org.moirabreen.com/2001/XMLSchema#dateTime</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org.peterbernik.com/2001/XMLSchema#dateTime</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchemastring</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/ns/auth/cert#hex</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/ns/auth/cert#int</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://270a.info/dateTime</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://purl.org/dc/terms/ISO3166ISO3166ISO3166</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://w3.org/2001/XMLSchema#integer</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#integerinteger</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"70\" valign=\"top\">html-embedded-jsonld</th>\n",
" <th>http://www.w3.org/2001/XMLSchema#string</th>\n",
" <td>13093783244</td>\n",
" <td>0.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#integer</th>\n",
" <td>2593727914</td>\n",
" <td>0.15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/Date</th>\n",
" <td>1384640759</td>\n",
" <td>0.08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#double</th>\n",
" <td>150589610</td>\n",
" <td>0.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#boolean</th>\n",
" <td>30496882</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/DateTime</th>\n",
" <td>24827352</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/1999/02/22-rdf-syntax-ns#langString</th>\n",
" <td>13424536</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#float</th>\n",
" <td>96972</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>40916</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/Text</th>\n",
" <td>14466</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/1999/02/22-rdf-syntax-ns#HTML</th>\n",
" <td>11982</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>7242</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://gs1.org/voc/AI</th>\n",
" <td>2601</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>2254</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/Distance</th>\n",
" <td>1852</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>528</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>o-module-numeric-xsd:integer</th>\n",
" <td>509</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#anyURI</th>\n",
" <td>471</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20190331015455/http://schema.org/Date</th>\n",
" <td>116</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYearMonth</th>\n",
" <td>95</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#String</th>\n",
" <td>49</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#decimal</th>\n",
" <td>27</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://blog.bendevoficial.com/posts/Text</th>\n",
" <td>22</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20190330140038/http://schema.org/Date</th>\n",
" <td>16</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20190330202647/http://schema.org/Date</th>\n",
" <td>16</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>14</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20190810002819/http://schema.org/Date</th>\n",
" <td>12</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#anyUri</th>\n",
" <td>8</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://blog.bendevoficial.com/Text</th>\n",
" <td>6</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20190520152016/http://schema.org/Date</th>\n",
" <td>4</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20191129090855/http://schema.org/Date</th>\n",
" <td>4</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.jobmixer.com/jobmeta.xsd#rgb</th>\n",
" <td>4</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#hexBinary</th>\n",
" <td>4</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20190520153034/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20191002120100/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20191201141148/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200203183144/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200216052208/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200226032923/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200226112742/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200226132450/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200308170457/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200409074300/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200417063602/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200422032746/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200424224913/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200424231827/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200424232231/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200424232742/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200424233357/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200424233438/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200425173607/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200425202134/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200425205843/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200425213329/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200425230318/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200427053705/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200427053806/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200427065455/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200427094022/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200428003135/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200428011543/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200428042912/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200428045617/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200428113319/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200502140109/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200502153051/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20200503060821/http://schema.org/Date</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://web.archive.org/web/20190515215654/http://schema.org/Date</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.lightsspectacular.com/2019/XMLSchema#dateTime</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"datatypeOccurrences(dataCleaned2021,usageByCategory2021)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Usage of Numeric Datatypes in Web Data Commons (December 2020)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MEASUREMENT</th>\n",
" <th>UsedAsDatatype</th>\n",
" <th>UsedAsDatatypeRatio</th>\n",
" </tr>\n",
" <tr>\n",
" <th>CATEGORY</th>\n",
" <th>DATATYPE</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"7\" valign=\"top\">html-rdfa</th>\n",
" <th>http://www.w3.org/2001/XMLSchema#integer</th>\n",
" <td>21547053</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#float</th>\n",
" <td>1025753</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#int</th>\n",
" <td>19427</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#decimal</th>\n",
" <td>8288</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#nonNegativeInteger</th>\n",
" <td>586</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#double</th>\n",
" <td>234</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#positiveInteger</th>\n",
" <td>6</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">html-embedded-jsonld</th>\n",
" <th>http://www.w3.org/2001/XMLSchema#integer</th>\n",
" <td>2021243795</td>\n",
" <td>0.14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#double</th>\n",
" <td>101959406</td>\n",
" <td>0.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#float</th>\n",
" <td>90895</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#long</th>\n",
" <td>36</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#decimal</th>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"datatypeOccurrences(dataCleaned2020.loc(axis=0)[:,numericDatatypes],usageByCategory2020)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Usage of Numeric Datatypes in Web Data Commons (October 2021)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MEASUREMENT</th>\n",
" <th>UsedAsDatatype</th>\n",
" <th>UsedAsDatatypeRatio</th>\n",
" </tr>\n",
" <tr>\n",
" <th>CATEGORY</th>\n",
" <th>DATATYPE</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">html-rdfa</th>\n",
" <th>http://www.w3.org/2001/XMLSchema#integer</th>\n",
" <td>3875545</td>\n",
" <td>0.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#float</th>\n",
" <td>261955</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#int</th>\n",
" <td>2528</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#decimal</th>\n",
" <td>1165</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">html-embedded-jsonld</th>\n",
" <th>http://www.w3.org/2001/XMLSchema#integer</th>\n",
" <td>2593727914</td>\n",
" <td>0.15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#double</th>\n",
" <td>150589610</td>\n",
" <td>0.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#float</th>\n",
" <td>96972</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#decimal</th>\n",
" <td>27</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"datatypeOccurrences(dataCleaned2021.loc(axis=0)[:,numericDatatypes],usageByCategory2021)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Usage of Temporal Datatypes in Web Data Commons (December 2020)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MEASUREMENT</th>\n",
" <th>UsedAsDatatype</th>\n",
" <th>UsedAsDatatypeRatio</th>\n",
" </tr>\n",
" <tr>\n",
" <th>CATEGORY</th>\n",
" <th>DATATYPE</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"8\" valign=\"top\">html-rdfa</th>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>102987223</td>\n",
" <td>0.02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>507454</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/Date</th>\n",
" <td>246995</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>197426</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>26911</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>835</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYearMonth</th>\n",
" <td>58</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/DateTime</th>\n",
" <td>7</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"8\" valign=\"top\">html-embedded-jsonld</th>\n",
" <th>http://schema.org/Date</th>\n",
" <td>1313408439</td>\n",
" <td>0.09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/DateTime</th>\n",
" <td>25002464</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>12260</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>3398</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>1428</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>34</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYearMonth</th>\n",
" <td>20</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"datatypeOccurrences(dataCleaned2020.loc(axis=0)[:,temporalDatatypes],usageByCategory2020)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Usage of Temporal Datatypes in Web Data Commons (October 2021)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MEASUREMENT</th>\n",
" <th>UsedAsDatatype</th>\n",
" <th>UsedAsDatatypeRatio</th>\n",
" </tr>\n",
" <tr>\n",
" <th>CATEGORY</th>\n",
" <th>DATATYPE</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">html-rdfa</th>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>19355530</td>\n",
" <td>0.05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>32406</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>21835</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>2434</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>37</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"8\" valign=\"top\">html-embedded-jsonld</th>\n",
" <th>http://schema.org/Date</th>\n",
" <td>1384640759</td>\n",
" <td>0.08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/DateTime</th>\n",
" <td>24827352</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>40916</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>7242</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>2254</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>528</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYearMonth</th>\n",
" <td>95</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>14</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"datatypeOccurrences(dataCleaned2021.loc(axis=0)[:,temporalDatatypes],usageByCategory2021)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lexical Representation of Temporal Literals in Web Data Commons (December 2020)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MEASUREMENT</th>\n",
" <th>UsedAsDatatype</th>\n",
" <th>UsedAsDatatypeRatio</th>\n",
" <th>ValidDateNotation</th>\n",
" <th>ValidDateNotationRatio</th>\n",
" <th>ValidDateTimeNotation</th>\n",
" <th>ValidDateTimeNotationRatio</th>\n",
" <th>ValidTimeNotation</th>\n",
" <th>ValidTimeNotationRatio</th>\n",
" </tr>\n",
" <tr>\n",
" <th>CATEGORY</th>\n",
" <th>DATATYPE</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"8\" valign=\"top\">html-rdfa</th>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>102987223</td>\n",
" <td>0.02</td>\n",
" <td>74220</td>\n",
" <td>0.00</td>\n",
" <td>102607586</td>\n",
" <td>1.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>507454</td>\n",
" <td>0.00</td>\n",
" <td>367919</td>\n",
" <td>0.73</td>\n",
" <td>18905</td>\n",
" <td>0.04</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/Date</th>\n",
" <td>246995</td>\n",
" <td>0.00</td>\n",
" <td>368</td>\n",
" <td>0.00</td>\n",
" <td>246219</td>\n",
" <td>1.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>197426</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>141612</td>\n",
" <td>0.72</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>26911</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>2396</td>\n",
" <td>0.09</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>835</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYearMonth</th>\n",
" <td>58</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>28</td>\n",
" <td>0.48</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/DateTime</th>\n",
" <td>7</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>7</td>\n",
" <td>1.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"8\" valign=\"top\">html-embedded-jsonld</th>\n",
" <th>http://schema.org/Date</th>\n",
" <td>1313408439</td>\n",
" <td>0.09</td>\n",
" <td>176077265</td>\n",
" <td>0.13</td>\n",
" <td>967033365</td>\n",
" <td>0.74</td>\n",
" <td>1143</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/DateTime</th>\n",
" <td>25002464</td>\n",
" <td>0.00</td>\n",
" <td>6069504</td>\n",
" <td>0.24</td>\n",
" <td>10580352</td>\n",
" <td>0.42</td>\n",
" <td>94628</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>12260</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>11668</td>\n",
" <td>0.95</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>3398</td>\n",
" <td>0.00</td>\n",
" <td>3274</td>\n",
" <td>0.96</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>1428</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>1428</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>34</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYearMonth</th>\n",
" <td>20</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>2</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"lexicalRepresentationOfTemporals(dataCleaned2020,usageByCategory2020)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lexical Representation of Temporal Literals in Web Data Commons (October 2021)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MEASUREMENT</th>\n",
" <th>UsedAsDatatype</th>\n",
" <th>UsedAsDatatypeRatio</th>\n",
" <th>ValidDateNotation</th>\n",
" <th>ValidDateNotationRatio</th>\n",
" <th>ValidDateTimeNotation</th>\n",
" <th>ValidDateTimeNotationRatio</th>\n",
" <th>ValidTimeNotation</th>\n",
" <th>ValidTimeNotationRatio</th>\n",
" </tr>\n",
" <tr>\n",
" <th>CATEGORY</th>\n",
" <th>DATATYPE</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">html-rdfa</th>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>19355530</td>\n",
" <td>0.05</td>\n",
" <td>27505</td>\n",
" <td>0.00</td>\n",
" <td>19243315</td>\n",
" <td>0.99</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>32406</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>21334</td>\n",
" <td>0.66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>21835</td>\n",
" <td>0.00</td>\n",
" <td>16811</td>\n",
" <td>0.77</td>\n",
" <td>1482</td>\n",
" <td>0.07</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>2434</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>37</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"8\" valign=\"top\">html-embedded-jsonld</th>\n",
" <th>http://schema.org/Date</th>\n",
" <td>1384640759</td>\n",
" <td>0.08</td>\n",
" <td>198639550</td>\n",
" <td>0.14</td>\n",
" <td>991794406</td>\n",
" <td>0.72</td>\n",
" <td>2585</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://schema.org/DateTime</th>\n",
" <td>24827352</td>\n",
" <td>0.00</td>\n",
" <td>4377660</td>\n",
" <td>0.18</td>\n",
" <td>12094933</td>\n",
" <td>0.49</td>\n",
" <td>52711</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#dateTime</th>\n",
" <td>40916</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>40105</td>\n",
" <td>0.98</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#date</th>\n",
" <td>7242</td>\n",
" <td>0.00</td>\n",
" <td>7235</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#time</th>\n",
" <td>2254</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>2254</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYear</th>\n",
" <td>528</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#gYearMonth</th>\n",
" <td>95</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://www.w3.org/2001/XMLSchema#duration</th>\n",
" <td>14</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"lexicalRepresentationOfTemporals(dataCleaned2021,usageByCategory2021)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment