Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save rafaelpezzuto/dbc0aa2dbe31161b0cf1e4b605c131ad to your computer and use it in GitHub Desktop.
Save rafaelpezzuto/dbc0aa2dbe31161b0cf1e4b605c131ad to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from lxml import etree\n",
"import pandas as pd\n",
"import concurrent.futures"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Root directory that is scanned for article XML files.\n",
"# Set the XMLS_BASEDIR environment variable to point somewhere else; the\n",
"# fallback is ~/.scielo-kerneldump so the notebook is not tied to a single\n",
"# user's absolute home path.\n",
"XMLS_BASEDIR = os.environ.get(\n",
"    \"XMLS_BASEDIR\",\n",
"    os.path.join(os.path.expanduser(\"~\"), \".scielo-kerneldump\"),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def iter_files(base_dir, suffix=None):\n",
"    \"\"\"Yield the path of every file found under ``base_dir``, recursively.\n",
"\n",
"    Directories and files are visited in sorted order so that repeated runs\n",
"    over the same tree produce the same sequence (``os.walk`` otherwise\n",
"    reports entries in an arbitrary, filesystem-dependent order).\n",
"\n",
"    Args:\n",
"        base_dir: Directory to walk. A missing directory yields nothing.\n",
"        suffix: Optional filename suffix (e.g. ``\".xml\"``) or tuple of\n",
"            suffixes. When given, only matching files are yielded; the\n",
"            default ``None`` yields every file.\n",
"\n",
"    Yields:\n",
"        ``os.path.join(root, name)`` for each selected file.\n",
"    \"\"\"\n",
"    for root, dirs, files in os.walk(base_dir):\n",
"        # Sorting in place also fixes the order in which os.walk descends.\n",
"        dirs.sort()\n",
"        for name in sorted(files):\n",
"            if suffix is None or name.endswith(suffix):\n",
"                yield os.path.join(root, name)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def _toint(value, default=-1):\n",
"    \"\"\"Convert ``value`` to ``int``, falling back to ``default``.\n",
"\n",
"    Args:\n",
"        value: Anything ``int()`` may accept (e.g. a numeric string). ``None``\n",
"            and non-numeric text are treated as unconvertible.\n",
"        default: Value returned when the conversion fails.\n",
"\n",
"    Returns:\n",
"        ``int(value)`` on success, otherwise ``default``.\n",
"    \"\"\"\n",
"    try:\n",
"        return int(value)\n",
"    except (TypeError, ValueError, OverflowError):\n",
"        # Only conversion failures are absorbed. A bare ``except`` would also\n",
"        # hide KeyboardInterrupt/SystemExit and unrelated bugs.\n",
"        return default"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.49 s, sys: 3.3 s, total: 4.79 s\n",
"Wall time: 6.1 s\n"
]
},
{
"data": {
"text/plain": [
"83186"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Materialise the full list of file paths under XMLS_BASEDIR once, so later\n",
"# cells can index into it and iterate over it repeatedly; the last line\n",
"# displays how many files were found.\n",
"xml_files = list(iter_files(XMLS_BASEDIR))\n",
"len(xml_files)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def get_first_text(xml, expr):\n",
"    \"\"\"Return the stripped text of the first result of ``xml.xpath(expr)``.\n",
"\n",
"    Args:\n",
"        xml: Object exposing an ``xpath(expr)`` method.\n",
"        expr: XPath expression to evaluate.\n",
"\n",
"    Returns:\n",
"        ``None`` when there is no first result. Otherwise the result's\n",
"        ``.text`` (or the result itself when it has no ``.text`` attribute),\n",
"        stripped of surrounding whitespace when it supports ``strip()`` and\n",
"        returned unchanged when it does not.\n",
"    \"\"\"\n",
"    try:\n",
"        head = xml.xpath(expr)[0]\n",
"    except IndexError:\n",
"        return None\n",
"\n",
"    # Results with a ``.text`` attribute contribute that text; anything else\n",
"    # is used as the value directly.\n",
"    value = getattr(head, \"text\", head)\n",
"\n",
"    try:\n",
"        stripped = value.strip()\n",
"    except AttributeError:\n",
"        # e.g. ``None`` text: nothing to strip, hand it back untouched.\n",
"        return value\n",
"    return stripped\n",
"\n",
"\n",
"def describe(xml_filename):\n",
"    \"\"\"Parse one article XML file and summarise its metadata, tables and figures.\n",
"\n",
"    Args:\n",
"        xml_filename: Path of the XML file, passed to ``etree.parse``.\n",
"\n",
"    Returns:\n",
"        A flat dict with a constant ``total_docs`` of 1, the file name, the\n",
"        article type, the DOI, the publication month (0 when it cannot be read\n",
"        as an integer) and year (-1 when it cannot be read as an integer), a\n",
"        flag telling whether a translation sub-article exists, and element\n",
"        counts for ``table-wrap``/``fig`` structures in the article body and\n",
"        in the bodies of translation sub-articles.\n",
"    \"\"\"\n",
"    tree = etree.parse(xml_filename)\n",
"\n",
"    def first(path):\n",
"        return get_first_text(tree, path)\n",
"\n",
"    def count(path):\n",
"        return len(tree.xpath(path))\n",
"\n",
"    # Shared XPath prefixes; each full expression below is a prefix plus a tail.\n",
"    pub_date = (\n",
"        \"/article/front/article-meta/pub-date\"\n",
"        \"[@pub-type = 'epub' or @pub-type = 'epub-ppub' or @date-type = 'pub']\"\n",
"    )\n",
"    body = \"/article/body\"\n",
"    translation = \"/article/sub-article[@article-type = 'translation']\"\n",
"    translation_body = translation + \"/body\"\n",
"\n",
"    return {\n",
"        \"total_docs\": 1,\n",
"        \"filename\": xml_filename,\n",
"        \"article_type\": first(\"/article/@article-type\"),\n",
"        \"doi\": first('/article/front/article-meta/article-id[@pub-id-type=\"doi\"]'),\n",
"        \"month\": _toint(first(pub_date + \"/month\"), default=0),\n",
"        \"year\": _toint(first(pub_date + \"/year\")),\n",
"        \"table_wrap_count\": count(body + \"//table-wrap\"),\n",
"        \"table_wrap_table_count\": count(body + \"//table-wrap/table\"),\n",
"        \"table_wrap_graphic_count\": count(body + \"//table-wrap/graphic\"),\n",
"        \"table_wrap_label_count\": count(body + \"//table-wrap/label\"),\n",
"        \"table_wrap_caption_title_count\": count(body + \"//table-wrap/caption/title\"),\n",
"        \"fig_count\": count(body + \"//fig\"),\n",
"        \"fig_label_count\": count(body + \"//fig/label\"),\n",
"        \"fig_caption_title_count\": count(body + \"//fig/caption/title\"),\n",
"        \"has_subarticle_translation\": bool(tree.xpath(translation)),\n",
"        \"subarticle_translation_table_wrap_count\": count(translation_body + \"//table-wrap\"),\n",
"        \"subarticle_translation_table_wrap_label_count\": count(translation_body + \"//table-wrap/label\"),\n",
"        \"subarticle_translation_table_wrap_caption_title_count\": count(translation_body + \"//table-wrap/caption/title\"),\n",
"        \"subarticle_translation_fig_count\": count(translation_body + \"//fig\"),\n",
"        \"subarticle_translation_fig_label_count\": count(translation_body + \"//fig/label\"),\n",
"        \"subarticle_translation_fig_caption_title_count\": count(translation_body + \"//fig/caption/title\"),\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'article_type': 'research-article',\n",
" 'doi': '10.21577/0103-5053.20160329',\n",
" 'fig_caption_title_count': 4,\n",
" 'fig_count': 4,\n",
" 'fig_label_count': 4,\n",
" 'filename': '/Users/gustavofonseca/.scielo-kerneldump/kernel/2017/Yk6g/Yk6g3tKzqScFC8BbD8jmXbR.xml',\n",
" 'has_subarticle_translation': False,\n",
" 'month': 9,\n",
" 'subarticle_translation_fig_caption_title_count': 0,\n",
" 'subarticle_translation_fig_count': 0,\n",
" 'subarticle_translation_fig_label_count': 0,\n",
" 'subarticle_translation_table_wrap_caption_title_count': 0,\n",
" 'subarticle_translation_table_wrap_count': 0,\n",
" 'subarticle_translation_table_wrap_label_count': 0,\n",
" 'table_wrap_caption_title_count': 2,\n",
" 'table_wrap_count': 2,\n",
" 'table_wrap_graphic_count': 0,\n",
" 'table_wrap_label_count': 2,\n",
" 'table_wrap_table_count': 2,\n",
" 'total_docs': 1,\n",
" 'year': 2017}\n"
]
}
],
"source": [
"from pprint import pprint as pp\n",
"# Sanity check: pretty-print the description of the first discovered file.\n",
"pp(describe(xml_files[0]))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 11min 16s, sys: 1min 19s, total: 12min 36s\n",
"Wall time: 4min 3s\n"
]
}
],
"source": [
"%%time\n",
"# Describe every file on a thread pool and stream the results into\n",
"# descriptions2.jsonl, one JSON document per line. executor.map returns the\n",
"# results in the same order as xml_files.\n",
"with concurrent.futures.ThreadPoolExecutor() as executor, open(\"descriptions2.jsonl\", \"w\") as output:\n",
"    for description in executor.map(describe, xml_files):\n",
"        output.write(json.dumps(description) + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"83186"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reload the per-file descriptions, one JSON document per line.\n",
"# The handle is deliberately not named `input`: at notebook top level that\n",
"# name would stay bound after the block and shadow the builtin input() for\n",
"# every later cell.\n",
"with open(\"descriptions2.jsonl\") as jsonl_file:\n",
"    descriptions = [json.loads(line) for line in jsonl_file]\n",
"len(descriptions)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(82723, 21)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only documents whose publication year is 2017 or later. Years that\n",
"# _toint cannot convert come back as -1 and are dropped by the same test.\n",
"descriptions_gte_2017 = [d for d in descriptions if _toint(d[\"year\"]) >= 2017]\n",
"# One row per remaining document; the last line displays (rows, columns).\n",
"df = pd.DataFrame.from_records(descriptions_gte_2017)\n",
"df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"O que queremos saber:\n",
"1. Qual o percentual de tabelas codificadas em HTML?\n",
"2. Qual o percentual de tabelas que possuem *label*?\n",
"3. Qual o percentual de tabelas que possuem *caption*?\n",
"4. Qual o percentual de figuras que possuem *label*?\n",
"5. Qual o percentual de figuras que possuem *caption*?"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>total_docs</th>\n",
" <th>table_wrap_count</th>\n",
" <th>table_wrap_table_count</th>\n",
" <th>table_wrap_graphic_count</th>\n",
" <th>table_wrap_label_count</th>\n",
" <th>table_wrap_caption_title_count</th>\n",
" <th>fig_count</th>\n",
" <th>fig_label_count</th>\n",
" <th>fig_caption_title_count</th>\n",
" <th>has_subarticle_translation</th>\n",
" <th>...</th>\n",
" <th>subarticle_translation_table_wrap_label_count</th>\n",
" <th>subarticle_translation_table_wrap_caption_title_count</th>\n",
" <th>subarticle_translation_fig_count</th>\n",
" <th>subarticle_translation_fig_label_count</th>\n",
" <th>subarticle_translation_fig_caption_title_count</th>\n",
" <th>percentage_html_tables</th>\n",
" <th>percentage_tables_with_labels</th>\n",
" <th>percentage_tables_with_captions_titles</th>\n",
" <th>percentage_fig_with_labels</th>\n",
" <th>percentage_fig_with_captions_titles</th>\n",
" </tr>\n",
" <tr>\n",
" <th>year</th>\n",
" <th>month</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"13\" valign=\"top\">2017</th>\n",
" <th>0</th>\n",
" <td>10937</td>\n",
" <td>21465</td>\n",
" <td>18827</td>\n",
" <td>2009</td>\n",
" <td>20818</td>\n",
" <td>20936</td>\n",
" <td>24384</td>\n",
" <td>23942</td>\n",
" <td>23923</td>\n",
" <td>2605.0</td>\n",
" <td>...</td>\n",
" <td>4884</td>\n",
" <td>4945</td>\n",
" <td>3705</td>\n",
" <td>3669</td>\n",
" <td>3672</td>\n",
" <td>87.7102</td>\n",
" <td>96.9858</td>\n",
" <td>97.5355</td>\n",
" <td>98.187336</td>\n",
" <td>98.109416</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>539</td>\n",
" <td>1135</td>\n",
" <td>978</td>\n",
" <td>10</td>\n",
" <td>1103</td>\n",
" <td>1103</td>\n",
" <td>1507</td>\n",
" <td>1495</td>\n",
" <td>1493</td>\n",
" <td>111.0</td>\n",
" <td>...</td>\n",
" <td>209</td>\n",
" <td>209</td>\n",
" <td>131</td>\n",
" <td>129</td>\n",
" <td>129</td>\n",
" <td>86.1674</td>\n",
" <td>97.1806</td>\n",
" <td>97.1806</td>\n",
" <td>99.203716</td>\n",
" <td>99.071002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>586</td>\n",
" <td>1440</td>\n",
" <td>1227</td>\n",
" <td>164</td>\n",
" <td>1415</td>\n",
" <td>1414</td>\n",
" <td>1564</td>\n",
" <td>1547</td>\n",
" <td>1550</td>\n",
" <td>107.0</td>\n",
" <td>...</td>\n",
" <td>277</td>\n",
" <td>277</td>\n",
" <td>225</td>\n",
" <td>220</td>\n",
" <td>220</td>\n",
" <td>85.2083</td>\n",
" <td>98.2639</td>\n",
" <td>98.1944</td>\n",
" <td>98.913043</td>\n",
" <td>99.104859</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1083</td>\n",
" <td>2569</td>\n",
" <td>2026</td>\n",
" <td>405</td>\n",
" <td>2549</td>\n",
" <td>2548</td>\n",
" <td>3106</td>\n",
" <td>3098</td>\n",
" <td>3102</td>\n",
" <td>196.0</td>\n",
" <td>...</td>\n",
" <td>489</td>\n",
" <td>489</td>\n",
" <td>280</td>\n",
" <td>280</td>\n",
" <td>280</td>\n",
" <td>78.8634</td>\n",
" <td>99.2215</td>\n",
" <td>99.1826</td>\n",
" <td>99.742434</td>\n",
" <td>99.871217</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1043</td>\n",
" <td>2219</td>\n",
" <td>1773</td>\n",
" <td>361</td>\n",
" <td>2211</td>\n",
" <td>2211</td>\n",
" <td>2502</td>\n",
" <td>2456</td>\n",
" <td>2464</td>\n",
" <td>178.0</td>\n",
" <td>...</td>\n",
" <td>441</td>\n",
" <td>441</td>\n",
" <td>367</td>\n",
" <td>367</td>\n",
" <td>367</td>\n",
" <td>79.9009</td>\n",
" <td>99.6395</td>\n",
" <td>99.6395</td>\n",
" <td>98.161471</td>\n",
" <td>98.481215</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>960</td>\n",
" <td>2156</td>\n",
" <td>1759</td>\n",
" <td>310</td>\n",
" <td>2150</td>\n",
" <td>2145</td>\n",
" <td>2418</td>\n",
" <td>2400</td>\n",
" <td>2401</td>\n",
" <td>140.0</td>\n",
" <td>...</td>\n",
" <td>360</td>\n",
" <td>360</td>\n",
" <td>184</td>\n",
" <td>184</td>\n",
" <td>182</td>\n",
" <td>81.5863</td>\n",
" <td>99.7217</td>\n",
" <td>99.4898</td>\n",
" <td>99.255583</td>\n",
" <td>99.296940</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1144</td>\n",
" <td>2741</td>\n",
" <td>2253</td>\n",
" <td>468</td>\n",
" <td>2709</td>\n",
" <td>2712</td>\n",
" <td>3560</td>\n",
" <td>3540</td>\n",
" <td>3515</td>\n",
" <td>240.0</td>\n",
" <td>...</td>\n",
" <td>562</td>\n",
" <td>565</td>\n",
" <td>340</td>\n",
" <td>338</td>\n",
" <td>337</td>\n",
" <td>82.1963</td>\n",
" <td>98.8325</td>\n",
" <td>98.942</td>\n",
" <td>99.438202</td>\n",
" <td>98.735955</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>930</td>\n",
" <td>2067</td>\n",
" <td>1641</td>\n",
" <td>351</td>\n",
" <td>2064</td>\n",
" <td>2064</td>\n",
" <td>2434</td>\n",
" <td>2420</td>\n",
" <td>2421</td>\n",
" <td>199.0</td>\n",
" <td>...</td>\n",
" <td>389</td>\n",
" <td>388</td>\n",
" <td>206</td>\n",
" <td>204</td>\n",
" <td>205</td>\n",
" <td>79.3904</td>\n",
" <td>99.8549</td>\n",
" <td>99.8549</td>\n",
" <td>99.424815</td>\n",
" <td>99.465900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1187</td>\n",
" <td>2573</td>\n",
" <td>2186</td>\n",
" <td>266</td>\n",
" <td>2536</td>\n",
" <td>2536</td>\n",
" <td>3127</td>\n",
" <td>3110</td>\n",
" <td>3108</td>\n",
" <td>178.0</td>\n",
" <td>...</td>\n",
" <td>343</td>\n",
" <td>342</td>\n",
" <td>265</td>\n",
" <td>265</td>\n",
" <td>265</td>\n",
" <td>84.9592</td>\n",
" <td>98.562</td>\n",
" <td>98.562</td>\n",
" <td>99.456348</td>\n",
" <td>99.392389</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>827</td>\n",
" <td>1929</td>\n",
" <td>1644</td>\n",
" <td>250</td>\n",
" <td>1928</td>\n",
" <td>1928</td>\n",
" <td>2548</td>\n",
" <td>2523</td>\n",
" <td>2520</td>\n",
" <td>209.0</td>\n",
" <td>...</td>\n",
" <td>486</td>\n",
" <td>486</td>\n",
" <td>351</td>\n",
" <td>346</td>\n",
" <td>346</td>\n",
" <td>85.2255</td>\n",
" <td>99.9482</td>\n",
" <td>99.9482</td>\n",
" <td>99.018838</td>\n",
" <td>98.901099</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>892</td>\n",
" <td>2119</td>\n",
" <td>1999</td>\n",
" <td>82</td>\n",
" <td>2115</td>\n",
" <td>2115</td>\n",
" <td>2779</td>\n",
" <td>2772</td>\n",
" <td>2759</td>\n",
" <td>162.0</td>\n",
" <td>...</td>\n",
" <td>365</td>\n",
" <td>365</td>\n",
" <td>257</td>\n",
" <td>256</td>\n",
" <td>250</td>\n",
" <td>94.337</td>\n",
" <td>99.8112</td>\n",
" <td>99.8112</td>\n",
" <td>99.748111</td>\n",
" <td>99.280317</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>810</td>\n",
" <td>1932</td>\n",
" <td>1834</td>\n",
" <td>1</td>\n",
" <td>1928</td>\n",
" <td>1928</td>\n",
" <td>2154</td>\n",
" <td>2152</td>\n",
" <td>2152</td>\n",
" <td>166.0</td>\n",
" <td>...</td>\n",
" <td>417</td>\n",
" <td>417</td>\n",
" <td>275</td>\n",
" <td>273</td>\n",
" <td>273</td>\n",
" <td>94.9275</td>\n",
" <td>99.793</td>\n",
" <td>99.793</td>\n",
" <td>99.907149</td>\n",
" <td>99.907149</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1397</td>\n",
" <td>2618</td>\n",
" <td>2557</td>\n",
" <td>11</td>\n",
" <td>2599</td>\n",
" <td>2599</td>\n",
" <td>3353</td>\n",
" <td>3320</td>\n",
" <td>3318</td>\n",
" <td>208.0</td>\n",
" <td>...</td>\n",
" <td>440</td>\n",
" <td>439</td>\n",
" <td>327</td>\n",
" <td>314</td>\n",
" <td>315</td>\n",
" <td>97.67</td>\n",
" <td>99.2743</td>\n",
" <td>99.2743</td>\n",
" <td>99.015807</td>\n",
" <td>98.956159</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"13\" valign=\"top\">2018</th>\n",
" <th>0</th>\n",
" <td>10759</td>\n",
" <td>21612</td>\n",
" <td>20660</td>\n",
" <td>16</td>\n",
" <td>21020</td>\n",
" <td>21018</td>\n",
" <td>23669</td>\n",
" <td>23128</td>\n",
" <td>23114</td>\n",
" <td>2420.0</td>\n",
" <td>...</td>\n",
" <td>4297</td>\n",
" <td>4306</td>\n",
" <td>3010</td>\n",
" <td>2995</td>\n",
" <td>3000</td>\n",
" <td>95.595</td>\n",
" <td>97.2608</td>\n",
" <td>97.2515</td>\n",
" <td>97.714310</td>\n",
" <td>97.655161</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>709</td>\n",
" <td>1320</td>\n",
" <td>1321</td>\n",
" <td>0</td>\n",
" <td>1303</td>\n",
" <td>1305</td>\n",
" <td>2124</td>\n",
" <td>2118</td>\n",
" <td>2115</td>\n",
" <td>115.0</td>\n",
" <td>...</td>\n",
" <td>199</td>\n",
" <td>199</td>\n",
" <td>178</td>\n",
" <td>178</td>\n",
" <td>178</td>\n",
" <td>100.076</td>\n",
" <td>98.7121</td>\n",
" <td>98.8636</td>\n",
" <td>99.717514</td>\n",
" <td>99.576271</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>863</td>\n",
" <td>1985</td>\n",
" <td>1959</td>\n",
" <td>0</td>\n",
" <td>1983</td>\n",
" <td>1983</td>\n",
" <td>2463</td>\n",
" <td>2456</td>\n",
" <td>2459</td>\n",
" <td>155.0</td>\n",
" <td>...</td>\n",
" <td>396</td>\n",
" <td>396</td>\n",
" <td>357</td>\n",
" <td>357</td>\n",
" <td>357</td>\n",
" <td>98.6902</td>\n",
" <td>99.8992</td>\n",
" <td>99.8992</td>\n",
" <td>99.715794</td>\n",
" <td>99.837596</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1089</td>\n",
" <td>2468</td>\n",
" <td>2449</td>\n",
" <td>0</td>\n",
" <td>2461</td>\n",
" <td>2460</td>\n",
" <td>3025</td>\n",
" <td>3014</td>\n",
" <td>3013</td>\n",
" <td>204.0</td>\n",
" <td>...</td>\n",
" <td>411</td>\n",
" <td>411</td>\n",
" <td>260</td>\n",
" <td>258</td>\n",
" <td>258</td>\n",
" <td>99.2301</td>\n",
" <td>99.7164</td>\n",
" <td>99.6759</td>\n",
" <td>99.636364</td>\n",
" <td>99.603306</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>988</td>\n",
" <td>1933</td>\n",
" <td>1918</td>\n",
" <td>1</td>\n",
" <td>1925</td>\n",
" <td>1928</td>\n",
" <td>2483</td>\n",
" <td>2451</td>\n",
" <td>2455</td>\n",
" <td>172.0</td>\n",
" <td>...</td>\n",
" <td>358</td>\n",
" <td>359</td>\n",
" <td>234</td>\n",
" <td>234</td>\n",
" <td>234</td>\n",
" <td>99.224</td>\n",
" <td>99.5861</td>\n",
" <td>99.7413</td>\n",
" <td>98.711236</td>\n",
" <td>98.872332</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>975</td>\n",
" <td>2133</td>\n",
" <td>2117</td>\n",
" <td>0</td>\n",
" <td>2128</td>\n",
" <td>2128</td>\n",
" <td>2857</td>\n",
" <td>2831</td>\n",
" <td>2828</td>\n",
" <td>236.0</td>\n",
" <td>...</td>\n",
" <td>560</td>\n",
" <td>560</td>\n",
" <td>403</td>\n",
" <td>396</td>\n",
" <td>376</td>\n",
" <td>99.2499</td>\n",
" <td>99.7656</td>\n",
" <td>99.7656</td>\n",
" <td>99.089954</td>\n",
" <td>98.984949</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1155</td>\n",
" <td>2368</td>\n",
" <td>2338</td>\n",
" <td>0</td>\n",
" <td>2358</td>\n",
" <td>2358</td>\n",
" <td>3235</td>\n",
" <td>3231</td>\n",
" <td>3217</td>\n",
" <td>289.0</td>\n",
" <td>...</td>\n",
" <td>542</td>\n",
" <td>541</td>\n",
" <td>421</td>\n",
" <td>419</td>\n",
" <td>419</td>\n",
" <td>98.7331</td>\n",
" <td>99.5777</td>\n",
" <td>99.5777</td>\n",
" <td>99.876352</td>\n",
" <td>99.443586</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1230</td>\n",
" <td>2911</td>\n",
" <td>2901</td>\n",
" <td>0</td>\n",
" <td>2903</td>\n",
" <td>2910</td>\n",
" <td>4593</td>\n",
" <td>4585</td>\n",
" <td>4585</td>\n",
" <td>259.0</td>\n",
" <td>...</td>\n",
" <td>574</td>\n",
" <td>581</td>\n",
" <td>365</td>\n",
" <td>365</td>\n",
" <td>365</td>\n",
" <td>99.6565</td>\n",
" <td>99.7252</td>\n",
" <td>99.9656</td>\n",
" <td>99.825822</td>\n",
" <td>99.825822</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1114</td>\n",
" <td>2144</td>\n",
" <td>2107</td>\n",
" <td>0</td>\n",
" <td>2126</td>\n",
" <td>2125</td>\n",
" <td>2814</td>\n",
" <td>2802</td>\n",
" <td>2791</td>\n",
" <td>266.0</td>\n",
" <td>...</td>\n",
" <td>540</td>\n",
" <td>540</td>\n",
" <td>415</td>\n",
" <td>410</td>\n",
" <td>414</td>\n",
" <td>98.2743</td>\n",
" <td>99.1604</td>\n",
" <td>99.1138</td>\n",
" <td>99.573561</td>\n",
" <td>99.182658</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>788</td>\n",
" <td>1836</td>\n",
" <td>1815</td>\n",
" <td>0</td>\n",
" <td>1765</td>\n",
" <td>1768</td>\n",
" <td>1918</td>\n",
" <td>1913</td>\n",
" <td>1908</td>\n",
" <td>192.0</td>\n",
" <td>...</td>\n",
" <td>394</td>\n",
" <td>394</td>\n",
" <td>252</td>\n",
" <td>252</td>\n",
" <td>252</td>\n",
" <td>98.8562</td>\n",
" <td>96.1329</td>\n",
" <td>96.2963</td>\n",
" <td>99.739312</td>\n",
" <td>99.478624</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1302</td>\n",
" <td>3186</td>\n",
" <td>3172</td>\n",
" <td>0</td>\n",
" <td>3178</td>\n",
" <td>3178</td>\n",
" <td>4046</td>\n",
" <td>4042</td>\n",
" <td>4020</td>\n",
" <td>222.0</td>\n",
" <td>...</td>\n",
" <td>519</td>\n",
" <td>519</td>\n",
" <td>287</td>\n",
" <td>283</td>\n",
" <td>283</td>\n",
" <td>99.5606</td>\n",
" <td>99.7489</td>\n",
" <td>99.7489</td>\n",
" <td>99.901137</td>\n",
" <td>99.357390</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>1256</td>\n",
" <td>2710</td>\n",
" <td>2662</td>\n",
" <td>4</td>\n",
" <td>2702</td>\n",
" <td>2703</td>\n",
" <td>3309</td>\n",
" <td>3282</td>\n",
" <td>3294</td>\n",
" <td>276.0</td>\n",
" <td>...</td>\n",
" <td>661</td>\n",
" <td>660</td>\n",
" <td>414</td>\n",
" <td>407</td>\n",
" <td>408</td>\n",
" <td>98.2288</td>\n",
" <td>99.7048</td>\n",
" <td>99.7417</td>\n",
" <td>99.184044</td>\n",
" <td>99.546691</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1044</td>\n",
" <td>2125</td>\n",
" <td>2125</td>\n",
" <td>0</td>\n",
" <td>2099</td>\n",
" <td>2101</td>\n",
" <td>2925</td>\n",
" <td>2906</td>\n",
" <td>2903</td>\n",
" <td>196.0</td>\n",
" <td>...</td>\n",
" <td>454</td>\n",
" <td>454</td>\n",
" <td>388</td>\n",
" <td>387</td>\n",
" <td>388</td>\n",
" <td>100</td>\n",
" <td>98.7765</td>\n",
" <td>98.8706</td>\n",
" <td>99.350427</td>\n",
" <td>99.247863</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"14\" valign=\"top\">2019</th>\n",
" <th>0</th>\n",
" <td>1364</td>\n",
" <td>2761</td>\n",
" <td>2701</td>\n",
" <td>0</td>\n",
" <td>2672</td>\n",
" <td>2672</td>\n",
" <td>3199</td>\n",
" <td>3162</td>\n",
" <td>3165</td>\n",
" <td>325.0</td>\n",
" <td>...</td>\n",
" <td>639</td>\n",
" <td>639</td>\n",
" <td>433</td>\n",
" <td>432</td>\n",
" <td>432</td>\n",
" <td>97.8269</td>\n",
" <td>96.7765</td>\n",
" <td>96.7765</td>\n",
" <td>98.843389</td>\n",
" <td>98.937168</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>800</td>\n",
" <td>1604</td>\n",
" <td>1600</td>\n",
" <td>0</td>\n",
" <td>1560</td>\n",
" <td>1561</td>\n",
" <td>1899</td>\n",
" <td>1887</td>\n",
" <td>1861</td>\n",
" <td>249.0</td>\n",
" <td>...</td>\n",
" <td>487</td>\n",
" <td>488</td>\n",
" <td>335</td>\n",
" <td>333</td>\n",
" <td>333</td>\n",
" <td>99.7506</td>\n",
" <td>97.2569</td>\n",
" <td>97.3192</td>\n",
" <td>99.368088</td>\n",
" <td>97.998947</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1086</td>\n",
" <td>2332</td>\n",
" <td>2308</td>\n",
" <td>0</td>\n",
" <td>2324</td>\n",
" <td>2325</td>\n",
" <td>2430</td>\n",
" <td>2423</td>\n",
" <td>2417</td>\n",
" <td>296.0</td>\n",
" <td>...</td>\n",
" <td>630</td>\n",
" <td>630</td>\n",
" <td>397</td>\n",
" <td>395</td>\n",
" <td>395</td>\n",
" <td>98.9708</td>\n",
" <td>99.6569</td>\n",
" <td>99.6998</td>\n",
" <td>99.711934</td>\n",
" <td>99.465021</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>830</td>\n",
" <td>1691</td>\n",
" <td>1694</td>\n",
" <td>0</td>\n",
" <td>1685</td>\n",
" <td>1685</td>\n",
" <td>2353</td>\n",
" <td>2301</td>\n",
" <td>2306</td>\n",
" <td>223.0</td>\n",
" <td>...</td>\n",
" <td>500</td>\n",
" <td>500</td>\n",
" <td>321</td>\n",
" <td>320</td>\n",
" <td>320</td>\n",
" <td>100.177</td>\n",
" <td>99.6452</td>\n",
" <td>99.6452</td>\n",
" <td>97.790055</td>\n",
" <td>98.002550</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1853</td>\n",
" <td>3676</td>\n",
" <td>3682</td>\n",
" <td>0</td>\n",
" <td>3632</td>\n",
" <td>3640</td>\n",
" <td>4342</td>\n",
" <td>4318</td>\n",
" <td>4299</td>\n",
" <td>430.0</td>\n",
" <td>...</td>\n",
" <td>839</td>\n",
" <td>839</td>\n",
" <td>574</td>\n",
" <td>566</td>\n",
" <td>561</td>\n",
" <td>100.163</td>\n",
" <td>98.803</td>\n",
" <td>99.0207</td>\n",
" <td>99.447259</td>\n",
" <td>99.009673</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2023</td>\n",
" <td>4339</td>\n",
" <td>4211</td>\n",
" <td>0</td>\n",
" <td>4270</td>\n",
" <td>4274</td>\n",
" <td>4819</td>\n",
" <td>4769</td>\n",
" <td>4773</td>\n",
" <td>393.0</td>\n",
" <td>...</td>\n",
" <td>770</td>\n",
" <td>772</td>\n",
" <td>654</td>\n",
" <td>650</td>\n",
" <td>649</td>\n",
" <td>97.05</td>\n",
" <td>98.4098</td>\n",
" <td>98.502</td>\n",
" <td>98.962440</td>\n",
" <td>99.045445</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1593</td>\n",
" <td>3705</td>\n",
" <td>3621</td>\n",
" <td>0</td>\n",
" <td>3632</td>\n",
" <td>3618</td>\n",
" <td>4458</td>\n",
" <td>4436</td>\n",
" <td>4433</td>\n",
" <td>370.0</td>\n",
" <td>...</td>\n",
" <td>694</td>\n",
" <td>694</td>\n",
" <td>506</td>\n",
" <td>505</td>\n",
" <td>506</td>\n",
" <td>97.7328</td>\n",
" <td>98.0297</td>\n",
" <td>97.6518</td>\n",
" <td>99.506505</td>\n",
" <td>99.439210</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1969</td>\n",
" <td>4328</td>\n",
" <td>4327</td>\n",
" <td>0</td>\n",
" <td>4238</td>\n",
" <td>4243</td>\n",
" <td>4547</td>\n",
" <td>4511</td>\n",
" <td>4541</td>\n",
" <td>448.0</td>\n",
" <td>...</td>\n",
" <td>960</td>\n",
" <td>962</td>\n",
" <td>767</td>\n",
" <td>753</td>\n",
" <td>754</td>\n",
" <td>99.9769</td>\n",
" <td>97.9205</td>\n",
" <td>98.036</td>\n",
" <td>99.208269</td>\n",
" <td>99.868045</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1888</td>\n",
" <td>4128</td>\n",
" <td>4038</td>\n",
" <td>0</td>\n",
" <td>4052</td>\n",
" <td>4043</td>\n",
" <td>4706</td>\n",
" <td>4627</td>\n",
" <td>4668</td>\n",
" <td>497.0</td>\n",
" <td>...</td>\n",
" <td>933</td>\n",
" <td>934</td>\n",
" <td>790</td>\n",
" <td>785</td>\n",
" <td>785</td>\n",
" <td>97.8198</td>\n",
" <td>98.1589</td>\n",
" <td>97.9409</td>\n",
" <td>98.321292</td>\n",
" <td>99.192520</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2314</td>\n",
" <td>4947</td>\n",
" <td>4794</td>\n",
" <td>0</td>\n",
" <td>4913</td>\n",
" <td>4915</td>\n",
" <td>5964</td>\n",
" <td>5903</td>\n",
" <td>5905</td>\n",
" <td>455.0</td>\n",
" <td>...</td>\n",
" <td>939</td>\n",
" <td>939</td>\n",
" <td>704</td>\n",
" <td>701</td>\n",
" <td>702</td>\n",
" <td>96.9072</td>\n",
" <td>99.3127</td>\n",
" <td>99.3531</td>\n",
" <td>98.977197</td>\n",
" <td>99.010731</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2197</td>\n",
" <td>4833</td>\n",
" <td>4824</td>\n",
" <td>0</td>\n",
" <td>4817</td>\n",
" <td>4814</td>\n",
" <td>5061</td>\n",
" <td>5040</td>\n",
" <td>5037</td>\n",
" <td>517.0</td>\n",
" <td>...</td>\n",
" <td>1158</td>\n",
" <td>1158</td>\n",
" <td>573</td>\n",
" <td>569</td>\n",
" <td>569</td>\n",
" <td>99.8138</td>\n",
" <td>99.6689</td>\n",
" <td>99.6069</td>\n",
" <td>99.585062</td>\n",
" <td>99.525785</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2203</td>\n",
" <td>4899</td>\n",
" <td>4794</td>\n",
" <td>0</td>\n",
" <td>4800</td>\n",
" <td>4800</td>\n",
" <td>5618</td>\n",
" <td>5540</td>\n",
" <td>5544</td>\n",
" <td>448.0</td>\n",
" <td>...</td>\n",
" <td>913</td>\n",
" <td>911</td>\n",
" <td>748</td>\n",
" <td>739</td>\n",
" <td>743</td>\n",
" <td>97.8567</td>\n",
" <td>97.9792</td>\n",
" <td>97.9792</td>\n",
" <td>98.611606</td>\n",
" <td>98.682805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>2498</td>\n",
" <td>5644</td>\n",
" <td>5475</td>\n",
" <td>0</td>\n",
" <td>5369</td>\n",
" <td>5095</td>\n",
" <td>5992</td>\n",
" <td>5947</td>\n",
" <td>5943</td>\n",
" <td>699.0</td>\n",
" <td>...</td>\n",
" <td>1350</td>\n",
" <td>1351</td>\n",
" <td>1195</td>\n",
" <td>1191</td>\n",
" <td>1190</td>\n",
" <td>97.0057</td>\n",
" <td>95.1276</td>\n",
" <td>90.2729</td>\n",
" <td>99.248999</td>\n",
" <td>99.182243</td>\n",
" </tr>\n",
" <tr>\n",
" <th>109</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"9\" valign=\"top\">2020</th>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>8</td>\n",
" <td>8</td>\n",
" <td>8</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>100</td>\n",
" <td>100</td>\n",
" <td>100</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1336</td>\n",
" <td>2606</td>\n",
" <td>2629</td>\n",
" <td>0</td>\n",
" <td>2539</td>\n",
" <td>2541</td>\n",
" <td>3454</td>\n",
" <td>3350</td>\n",
" <td>3316</td>\n",
" <td>312.0</td>\n",
" <td>...</td>\n",
" <td>633</td>\n",
" <td>633</td>\n",
" <td>422</td>\n",
" <td>421</td>\n",
" <td>421</td>\n",
" <td>100.883</td>\n",
" <td>97.429</td>\n",
" <td>97.5058</td>\n",
" <td>96.988998</td>\n",
" <td>96.004632</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1267</td>\n",
" <td>2717</td>\n",
" <td>2620</td>\n",
" <td>0</td>\n",
" <td>2698</td>\n",
" <td>2697</td>\n",
" <td>2689</td>\n",
" <td>2668</td>\n",
" <td>2667</td>\n",
" <td>334.0</td>\n",
" <td>...</td>\n",
" <td>683</td>\n",
" <td>683</td>\n",
" <td>389</td>\n",
" <td>387</td>\n",
" <td>387</td>\n",
" <td>96.4299</td>\n",
" <td>99.3007</td>\n",
" <td>99.2639</td>\n",
" <td>99.219041</td>\n",
" <td>99.181852</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1849</td>\n",
" <td>3736</td>\n",
" <td>3726</td>\n",
" <td>0</td>\n",
" <td>3689</td>\n",
" <td>3665</td>\n",
" <td>4318</td>\n",
" <td>4281</td>\n",
" <td>4292</td>\n",
" <td>525.0</td>\n",
" <td>...</td>\n",
" <td>1047</td>\n",
" <td>1047</td>\n",
" <td>812</td>\n",
" <td>811</td>\n",
" <td>811</td>\n",
" <td>99.7323</td>\n",
" <td>98.742</td>\n",
" <td>98.0996</td>\n",
" <td>99.143122</td>\n",
" <td>99.397869</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1691</td>\n",
" <td>3602</td>\n",
" <td>3603</td>\n",
" <td>0</td>\n",
" <td>3530</td>\n",
" <td>3530</td>\n",
" <td>4686</td>\n",
" <td>4625</td>\n",
" <td>4570</td>\n",
" <td>366.0</td>\n",
" <td>...</td>\n",
" <td>592</td>\n",
" <td>592</td>\n",
" <td>521</td>\n",
" <td>485</td>\n",
" <td>452</td>\n",
" <td>100.028</td>\n",
" <td>98.0011</td>\n",
" <td>98.0011</td>\n",
" <td>98.698250</td>\n",
" <td>97.524541</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1938</td>\n",
" <td>3947</td>\n",
" <td>3834</td>\n",
" <td>0</td>\n",
" <td>3904</td>\n",
" <td>3889</td>\n",
" <td>4681</td>\n",
" <td>4655</td>\n",
" <td>4642</td>\n",
" <td>506.0</td>\n",
" <td>...</td>\n",
" <td>927</td>\n",
" <td>927</td>\n",
" <td>703</td>\n",
" <td>697</td>\n",
" <td>697</td>\n",
" <td>97.1371</td>\n",
" <td>98.9106</td>\n",
" <td>98.5305</td>\n",
" <td>99.444563</td>\n",
" <td>99.166845</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2360</td>\n",
" <td>4582</td>\n",
" <td>4578</td>\n",
" <td>0</td>\n",
" <td>4551</td>\n",
" <td>4545</td>\n",
" <td>5439</td>\n",
" <td>5390</td>\n",
" <td>5364</td>\n",
" <td>596.0</td>\n",
" <td>...</td>\n",
" <td>1205</td>\n",
" <td>1205</td>\n",
" <td>894</td>\n",
" <td>888</td>\n",
" <td>888</td>\n",
" <td>99.9127</td>\n",
" <td>99.3234</td>\n",
" <td>99.1925</td>\n",
" <td>99.099099</td>\n",
" <td>98.621070</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>2530</td>\n",
" <td>5339</td>\n",
" <td>5198</td>\n",
" <td>0</td>\n",
" <td>5295</td>\n",
" <td>5295</td>\n",
" <td>6698</td>\n",
" <td>6646</td>\n",
" <td>6635</td>\n",
" <td>612.0</td>\n",
" <td>...</td>\n",
" <td>1228</td>\n",
" <td>1229</td>\n",
" <td>787</td>\n",
" <td>783</td>\n",
" <td>784</td>\n",
" <td>97.3591</td>\n",
" <td>99.1759</td>\n",
" <td>99.1759</td>\n",
" <td>99.223649</td>\n",
" <td>99.059421</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1517</td>\n",
" <td>2862</td>\n",
" <td>2863</td>\n",
" <td>0</td>\n",
" <td>2766</td>\n",
" <td>2772</td>\n",
" <td>3659</td>\n",
" <td>3630</td>\n",
" <td>3624</td>\n",
" <td>375.0</td>\n",
" <td>...</td>\n",
" <td>682</td>\n",
" <td>687</td>\n",
" <td>521</td>\n",
" <td>514</td>\n",
" <td>515</td>\n",
" <td>100.035</td>\n",
" <td>96.6457</td>\n",
" <td>96.8553</td>\n",
" <td>99.207434</td>\n",
" <td>99.043454</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>49 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" total_docs table_wrap_count table_wrap_table_count \\\n",
"year month \n",
"2017 0 10937 21465 18827 \n",
" 1 539 1135 978 \n",
" 2 586 1440 1227 \n",
" 3 1083 2569 2026 \n",
" 4 1043 2219 1773 \n",
" 5 960 2156 1759 \n",
" 6 1144 2741 2253 \n",
" 7 930 2067 1641 \n",
" 8 1187 2573 2186 \n",
" 9 827 1929 1644 \n",
" 10 892 2119 1999 \n",
" 11 810 1932 1834 \n",
" 12 1397 2618 2557 \n",
"2018 0 10759 21612 20660 \n",
" 1 709 1320 1321 \n",
" 2 863 1985 1959 \n",
" 3 1089 2468 2449 \n",
" 4 988 1933 1918 \n",
" 5 975 2133 2117 \n",
" 6 1155 2368 2338 \n",
" 7 1230 2911 2901 \n",
" 8 1114 2144 2107 \n",
" 9 788 1836 1815 \n",
" 10 1302 3186 3172 \n",
" 11 1256 2710 2662 \n",
" 12 1044 2125 2125 \n",
"2019 0 1364 2761 2701 \n",
" 1 800 1604 1600 \n",
" 2 1086 2332 2308 \n",
" 3 830 1691 1694 \n",
" 4 1853 3676 3682 \n",
" 5 2023 4339 4211 \n",
" 6 1593 3705 3621 \n",
" 7 1969 4328 4327 \n",
" 8 1888 4128 4038 \n",
" 9 2314 4947 4794 \n",
" 10 2197 4833 4824 \n",
" 11 2203 4899 4794 \n",
" 12 2498 5644 5475 \n",
" 109 1 0 0 \n",
"2020 0 2 4 4 \n",
" 1 1336 2606 2629 \n",
" 2 1267 2717 2620 \n",
" 3 1849 3736 3726 \n",
" 4 1691 3602 3603 \n",
" 5 1938 3947 3834 \n",
" 6 2360 4582 4578 \n",
" 7 2530 5339 5198 \n",
" 8 1517 2862 2863 \n",
"\n",
" table_wrap_graphic_count table_wrap_label_count \\\n",
"year month \n",
"2017 0 2009 20818 \n",
" 1 10 1103 \n",
" 2 164 1415 \n",
" 3 405 2549 \n",
" 4 361 2211 \n",
" 5 310 2150 \n",
" 6 468 2709 \n",
" 7 351 2064 \n",
" 8 266 2536 \n",
" 9 250 1928 \n",
" 10 82 2115 \n",
" 11 1 1928 \n",
" 12 11 2599 \n",
"2018 0 16 21020 \n",
" 1 0 1303 \n",
" 2 0 1983 \n",
" 3 0 2461 \n",
" 4 1 1925 \n",
" 5 0 2128 \n",
" 6 0 2358 \n",
" 7 0 2903 \n",
" 8 0 2126 \n",
" 9 0 1765 \n",
" 10 0 3178 \n",
" 11 4 2702 \n",
" 12 0 2099 \n",
"2019 0 0 2672 \n",
" 1 0 1560 \n",
" 2 0 2324 \n",
" 3 0 1685 \n",
" 4 0 3632 \n",
" 5 0 4270 \n",
" 6 0 3632 \n",
" 7 0 4238 \n",
" 8 0 4052 \n",
" 9 0 4913 \n",
" 10 0 4817 \n",
" 11 0 4800 \n",
" 12 0 5369 \n",
" 109 0 0 \n",
"2020 0 0 4 \n",
" 1 0 2539 \n",
" 2 0 2698 \n",
" 3 0 3689 \n",
" 4 0 3530 \n",
" 5 0 3904 \n",
" 6 0 4551 \n",
" 7 0 5295 \n",
" 8 0 2766 \n",
"\n",
" table_wrap_caption_title_count fig_count fig_label_count \\\n",
"year month \n",
"2017 0 20936 24384 23942 \n",
" 1 1103 1507 1495 \n",
" 2 1414 1564 1547 \n",
" 3 2548 3106 3098 \n",
" 4 2211 2502 2456 \n",
" 5 2145 2418 2400 \n",
" 6 2712 3560 3540 \n",
" 7 2064 2434 2420 \n",
" 8 2536 3127 3110 \n",
" 9 1928 2548 2523 \n",
" 10 2115 2779 2772 \n",
" 11 1928 2154 2152 \n",
" 12 2599 3353 3320 \n",
"2018 0 21018 23669 23128 \n",
" 1 1305 2124 2118 \n",
" 2 1983 2463 2456 \n",
" 3 2460 3025 3014 \n",
" 4 1928 2483 2451 \n",
" 5 2128 2857 2831 \n",
" 6 2358 3235 3231 \n",
" 7 2910 4593 4585 \n",
" 8 2125 2814 2802 \n",
" 9 1768 1918 1913 \n",
" 10 3178 4046 4042 \n",
" 11 2703 3309 3282 \n",
" 12 2101 2925 2906 \n",
"2019 0 2672 3199 3162 \n",
" 1 1561 1899 1887 \n",
" 2 2325 2430 2423 \n",
" 3 1685 2353 2301 \n",
" 4 3640 4342 4318 \n",
" 5 4274 4819 4769 \n",
" 6 3618 4458 4436 \n",
" 7 4243 4547 4511 \n",
" 8 4043 4706 4627 \n",
" 9 4915 5964 5903 \n",
" 10 4814 5061 5040 \n",
" 11 4800 5618 5540 \n",
" 12 5095 5992 5947 \n",
" 109 0 4 4 \n",
"2020 0 4 8 8 \n",
" 1 2541 3454 3350 \n",
" 2 2697 2689 2668 \n",
" 3 3665 4318 4281 \n",
" 4 3530 4686 4625 \n",
" 5 3889 4681 4655 \n",
" 6 4545 5439 5390 \n",
" 7 5295 6698 6646 \n",
" 8 2772 3659 3630 \n",
"\n",
" fig_caption_title_count has_subarticle_translation ... \\\n",
"year month ... \n",
"2017 0 23923 2605.0 ... \n",
" 1 1493 111.0 ... \n",
" 2 1550 107.0 ... \n",
" 3 3102 196.0 ... \n",
" 4 2464 178.0 ... \n",
" 5 2401 140.0 ... \n",
" 6 3515 240.0 ... \n",
" 7 2421 199.0 ... \n",
" 8 3108 178.0 ... \n",
" 9 2520 209.0 ... \n",
" 10 2759 162.0 ... \n",
" 11 2152 166.0 ... \n",
" 12 3318 208.0 ... \n",
"2018 0 23114 2420.0 ... \n",
" 1 2115 115.0 ... \n",
" 2 2459 155.0 ... \n",
" 3 3013 204.0 ... \n",
" 4 2455 172.0 ... \n",
" 5 2828 236.0 ... \n",
" 6 3217 289.0 ... \n",
" 7 4585 259.0 ... \n",
" 8 2791 266.0 ... \n",
" 9 1908 192.0 ... \n",
" 10 4020 222.0 ... \n",
" 11 3294 276.0 ... \n",
" 12 2903 196.0 ... \n",
"2019 0 3165 325.0 ... \n",
" 1 1861 249.0 ... \n",
" 2 2417 296.0 ... \n",
" 3 2306 223.0 ... \n",
" 4 4299 430.0 ... \n",
" 5 4773 393.0 ... \n",
" 6 4433 370.0 ... \n",
" 7 4541 448.0 ... \n",
" 8 4668 497.0 ... \n",
" 9 5905 455.0 ... \n",
" 10 5037 517.0 ... \n",
" 11 5544 448.0 ... \n",
" 12 5943 699.0 ... \n",
" 109 4 0.0 ... \n",
"2020 0 8 0.0 ... \n",
" 1 3316 312.0 ... \n",
" 2 2667 334.0 ... \n",
" 3 4292 525.0 ... \n",
" 4 4570 366.0 ... \n",
" 5 4642 506.0 ... \n",
" 6 5364 596.0 ... \n",
" 7 6635 612.0 ... \n",
" 8 3624 375.0 ... \n",
"\n",
" subarticle_translation_table_wrap_label_count \\\n",
"year month \n",
"2017 0 4884 \n",
" 1 209 \n",
" 2 277 \n",
" 3 489 \n",
" 4 441 \n",
" 5 360 \n",
" 6 562 \n",
" 7 389 \n",
" 8 343 \n",
" 9 486 \n",
" 10 365 \n",
" 11 417 \n",
" 12 440 \n",
"2018 0 4297 \n",
" 1 199 \n",
" 2 396 \n",
" 3 411 \n",
" 4 358 \n",
" 5 560 \n",
" 6 542 \n",
" 7 574 \n",
" 8 540 \n",
" 9 394 \n",
" 10 519 \n",
" 11 661 \n",
" 12 454 \n",
"2019 0 639 \n",
" 1 487 \n",
" 2 630 \n",
" 3 500 \n",
" 4 839 \n",
" 5 770 \n",
" 6 694 \n",
" 7 960 \n",
" 8 933 \n",
" 9 939 \n",
" 10 1158 \n",
" 11 913 \n",
" 12 1350 \n",
" 109 0 \n",
"2020 0 0 \n",
" 1 633 \n",
" 2 683 \n",
" 3 1047 \n",
" 4 592 \n",
" 5 927 \n",
" 6 1205 \n",
" 7 1228 \n",
" 8 682 \n",
"\n",
" subarticle_translation_table_wrap_caption_title_count \\\n",
"year month \n",
"2017 0 4945 \n",
" 1 209 \n",
" 2 277 \n",
" 3 489 \n",
" 4 441 \n",
" 5 360 \n",
" 6 565 \n",
" 7 388 \n",
" 8 342 \n",
" 9 486 \n",
" 10 365 \n",
" 11 417 \n",
" 12 439 \n",
"2018 0 4306 \n",
" 1 199 \n",
" 2 396 \n",
" 3 411 \n",
" 4 359 \n",
" 5 560 \n",
" 6 541 \n",
" 7 581 \n",
" 8 540 \n",
" 9 394 \n",
" 10 519 \n",
" 11 660 \n",
" 12 454 \n",
"2019 0 639 \n",
" 1 488 \n",
" 2 630 \n",
" 3 500 \n",
" 4 839 \n",
" 5 772 \n",
" 6 694 \n",
" 7 962 \n",
" 8 934 \n",
" 9 939 \n",
" 10 1158 \n",
" 11 911 \n",
" 12 1351 \n",
" 109 0 \n",
"2020 0 0 \n",
" 1 633 \n",
" 2 683 \n",
" 3 1047 \n",
" 4 592 \n",
" 5 927 \n",
" 6 1205 \n",
" 7 1229 \n",
" 8 687 \n",
"\n",
" subarticle_translation_fig_count \\\n",
"year month \n",
"2017 0 3705 \n",
" 1 131 \n",
" 2 225 \n",
" 3 280 \n",
" 4 367 \n",
" 5 184 \n",
" 6 340 \n",
" 7 206 \n",
" 8 265 \n",
" 9 351 \n",
" 10 257 \n",
" 11 275 \n",
" 12 327 \n",
"2018 0 3010 \n",
" 1 178 \n",
" 2 357 \n",
" 3 260 \n",
" 4 234 \n",
" 5 403 \n",
" 6 421 \n",
" 7 365 \n",
" 8 415 \n",
" 9 252 \n",
" 10 287 \n",
" 11 414 \n",
" 12 388 \n",
"2019 0 433 \n",
" 1 335 \n",
" 2 397 \n",
" 3 321 \n",
" 4 574 \n",
" 5 654 \n",
" 6 506 \n",
" 7 767 \n",
" 8 790 \n",
" 9 704 \n",
" 10 573 \n",
" 11 748 \n",
" 12 1195 \n",
" 109 0 \n",
"2020 0 0 \n",
" 1 422 \n",
" 2 389 \n",
" 3 812 \n",
" 4 521 \n",
" 5 703 \n",
" 6 894 \n",
" 7 787 \n",
" 8 521 \n",
"\n",
" subarticle_translation_fig_label_count \\\n",
"year month \n",
"2017 0 3669 \n",
" 1 129 \n",
" 2 220 \n",
" 3 280 \n",
" 4 367 \n",
" 5 184 \n",
" 6 338 \n",
" 7 204 \n",
" 8 265 \n",
" 9 346 \n",
" 10 256 \n",
" 11 273 \n",
" 12 314 \n",
"2018 0 2995 \n",
" 1 178 \n",
" 2 357 \n",
" 3 258 \n",
" 4 234 \n",
" 5 396 \n",
" 6 419 \n",
" 7 365 \n",
" 8 410 \n",
" 9 252 \n",
" 10 283 \n",
" 11 407 \n",
" 12 387 \n",
"2019 0 432 \n",
" 1 333 \n",
" 2 395 \n",
" 3 320 \n",
" 4 566 \n",
" 5 650 \n",
" 6 505 \n",
" 7 753 \n",
" 8 785 \n",
" 9 701 \n",
" 10 569 \n",
" 11 739 \n",
" 12 1191 \n",
" 109 0 \n",
"2020 0 0 \n",
" 1 421 \n",
" 2 387 \n",
" 3 811 \n",
" 4 485 \n",
" 5 697 \n",
" 6 888 \n",
" 7 783 \n",
" 8 514 \n",
"\n",
" subarticle_translation_fig_caption_title_count \\\n",
"year month \n",
"2017 0 3672 \n",
" 1 129 \n",
" 2 220 \n",
" 3 280 \n",
" 4 367 \n",
" 5 182 \n",
" 6 337 \n",
" 7 205 \n",
" 8 265 \n",
" 9 346 \n",
" 10 250 \n",
" 11 273 \n",
" 12 315 \n",
"2018 0 3000 \n",
" 1 178 \n",
" 2 357 \n",
" 3 258 \n",
" 4 234 \n",
" 5 376 \n",
" 6 419 \n",
" 7 365 \n",
" 8 414 \n",
" 9 252 \n",
" 10 283 \n",
" 11 408 \n",
" 12 388 \n",
"2019 0 432 \n",
" 1 333 \n",
" 2 395 \n",
" 3 320 \n",
" 4 561 \n",
" 5 649 \n",
" 6 506 \n",
" 7 754 \n",
" 8 785 \n",
" 9 702 \n",
" 10 569 \n",
" 11 743 \n",
" 12 1190 \n",
" 109 0 \n",
"2020 0 0 \n",
" 1 421 \n",
" 2 387 \n",
" 3 811 \n",
" 4 452 \n",
" 5 697 \n",
" 6 888 \n",
" 7 784 \n",
" 8 515 \n",
"\n",
" percentage_html_tables percentage_tables_with_labels \\\n",
"year month \n",
"2017 0 87.7102 96.9858 \n",
" 1 86.1674 97.1806 \n",
" 2 85.2083 98.2639 \n",
" 3 78.8634 99.2215 \n",
" 4 79.9009 99.6395 \n",
" 5 81.5863 99.7217 \n",
" 6 82.1963 98.8325 \n",
" 7 79.3904 99.8549 \n",
" 8 84.9592 98.562 \n",
" 9 85.2255 99.9482 \n",
" 10 94.337 99.8112 \n",
" 11 94.9275 99.793 \n",
" 12 97.67 99.2743 \n",
"2018 0 95.595 97.2608 \n",
" 1 100.076 98.7121 \n",
" 2 98.6902 99.8992 \n",
" 3 99.2301 99.7164 \n",
" 4 99.224 99.5861 \n",
" 5 99.2499 99.7656 \n",
" 6 98.7331 99.5777 \n",
" 7 99.6565 99.7252 \n",
" 8 98.2743 99.1604 \n",
" 9 98.8562 96.1329 \n",
" 10 99.5606 99.7489 \n",
" 11 98.2288 99.7048 \n",
" 12 100 98.7765 \n",
"2019 0 97.8269 96.7765 \n",
" 1 99.7506 97.2569 \n",
" 2 98.9708 99.6569 \n",
" 3 100.177 99.6452 \n",
" 4 100.163 98.803 \n",
" 5 97.05 98.4098 \n",
" 6 97.7328 98.0297 \n",
" 7 99.9769 97.9205 \n",
" 8 97.8198 98.1589 \n",
" 9 96.9072 99.3127 \n",
" 10 99.8138 99.6689 \n",
" 11 97.8567 97.9792 \n",
" 12 97.0057 95.1276 \n",
" 109 \n",
"2020 0 100 100 \n",
" 1 100.883 97.429 \n",
" 2 96.4299 99.3007 \n",
" 3 99.7323 98.742 \n",
" 4 100.028 98.0011 \n",
" 5 97.1371 98.9106 \n",
" 6 99.9127 99.3234 \n",
" 7 97.3591 99.1759 \n",
" 8 100.035 96.6457 \n",
"\n",
" percentage_tables_with_captions_titles percentage_fig_with_labels \\\n",
"year month \n",
"2017 0 97.5355 98.187336 \n",
" 1 97.1806 99.203716 \n",
" 2 98.1944 98.913043 \n",
" 3 99.1826 99.742434 \n",
" 4 99.6395 98.161471 \n",
" 5 99.4898 99.255583 \n",
" 6 98.942 99.438202 \n",
" 7 99.8549 99.424815 \n",
" 8 98.562 99.456348 \n",
" 9 99.9482 99.018838 \n",
" 10 99.8112 99.748111 \n",
" 11 99.793 99.907149 \n",
" 12 99.2743 99.015807 \n",
"2018 0 97.2515 97.714310 \n",
" 1 98.8636 99.717514 \n",
" 2 99.8992 99.715794 \n",
" 3 99.6759 99.636364 \n",
" 4 99.7413 98.711236 \n",
" 5 99.7656 99.089954 \n",
" 6 99.5777 99.876352 \n",
" 7 99.9656 99.825822 \n",
" 8 99.1138 99.573561 \n",
" 9 96.2963 99.739312 \n",
" 10 99.7489 99.901137 \n",
" 11 99.7417 99.184044 \n",
" 12 98.8706 99.350427 \n",
"2019 0 96.7765 98.843389 \n",
" 1 97.3192 99.368088 \n",
" 2 99.6998 99.711934 \n",
" 3 99.6452 97.790055 \n",
" 4 99.0207 99.447259 \n",
" 5 98.502 98.962440 \n",
" 6 97.6518 99.506505 \n",
" 7 98.036 99.208269 \n",
" 8 97.9409 98.321292 \n",
" 9 99.3531 98.977197 \n",
" 10 99.6069 99.585062 \n",
" 11 97.9792 98.611606 \n",
" 12 90.2729 99.248999 \n",
" 109 100.000000 \n",
"2020 0 100 100.000000 \n",
" 1 97.5058 96.988998 \n",
" 2 99.2639 99.219041 \n",
" 3 98.0996 99.143122 \n",
" 4 98.0011 98.698250 \n",
" 5 98.5305 99.444563 \n",
" 6 99.1925 99.099099 \n",
" 7 99.1759 99.223649 \n",
" 8 96.8553 99.207434 \n",
"\n",
" percentage_fig_with_captions_titles \n",
"year month \n",
"2017 0 98.109416 \n",
" 1 99.071002 \n",
" 2 99.104859 \n",
" 3 99.871217 \n",
" 4 98.481215 \n",
" 5 99.296940 \n",
" 6 98.735955 \n",
" 7 99.465900 \n",
" 8 99.392389 \n",
" 9 98.901099 \n",
" 10 99.280317 \n",
" 11 99.907149 \n",
" 12 98.956159 \n",
"2018 0 97.655161 \n",
" 1 99.576271 \n",
" 2 99.837596 \n",
" 3 99.603306 \n",
" 4 98.872332 \n",
" 5 98.984949 \n",
" 6 99.443586 \n",
" 7 99.825822 \n",
" 8 99.182658 \n",
" 9 99.478624 \n",
" 10 99.357390 \n",
" 11 99.546691 \n",
" 12 99.247863 \n",
"2019 0 98.937168 \n",
" 1 97.998947 \n",
" 2 99.465021 \n",
" 3 98.002550 \n",
" 4 99.009673 \n",
" 5 99.045445 \n",
" 6 99.439210 \n",
" 7 99.868045 \n",
" 8 99.192520 \n",
" 9 99.010731 \n",
" 10 99.525785 \n",
" 11 98.682805 \n",
" 12 99.182243 \n",
" 109 100.000000 \n",
"2020 0 100.000000 \n",
" 1 96.004632 \n",
" 2 99.181852 \n",
" 3 99.397869 \n",
" 4 97.524541 \n",
" 5 99.166845 \n",
" 6 98.621070 \n",
" 7 99.059421 \n",
" 8 99.043454 \n",
"\n",
"[49 rows x 21 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = (\n",
" df.groupby([\"year\", \"month\"]).sum()\n",
" .assign(\n",
" percentage_html_tables=lambda df: (df.table_wrap_table_count / df.table_wrap_count) * 100,\n",
" percentage_tables_with_labels=lambda df: (df.table_wrap_label_count / df.table_wrap_count) * 100,\n",
" percentage_tables_with_captions_titles=lambda df: (df.table_wrap_caption_title_count / df.table_wrap_count) * 100,\n",
" percentage_fig_with_labels=lambda df: (df.fig_label_count / df.fig_count) * 100,\n",
" percentage_fig_with_captions_titles=lambda df: (df.fig_caption_title_count / df.fig_count) * 100,\n",
" ) \n",
").fillna(\"\")\n",
"# Removeremos da análise os meses 9, 10 e 11 de 2020\n",
"df = df.drop(df.index[-3:])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"total_ativos_agrupados_2017-2020.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>total_docs</th>\n",
" <td>82716.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>total_tables</th>\n",
" <td>173976.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>total_figs</th>\n",
" <td>205921.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>percentage_html_tables</th>\n",
" <td>95.054490</td>\n",
" </tr>\n",
" <tr>\n",
" <th>percentage_tables_with_labels</th>\n",
" <td>98.298616</td>\n",
" </tr>\n",
" <tr>\n",
" <th>percentage_tables_with_captions_titles</th>\n",
" <td>98.189980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>percentage_fig_with_labels</th>\n",
" <td>98.899578</td>\n",
" </tr>\n",
" <tr>\n",
" <th>percentage_fig_with_captions_titles</th>\n",
" <td>98.796140</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0\n",
"total_docs 82716.000000\n",
"total_tables 173976.000000\n",
"total_figs 205921.000000\n",
"percentage_html_tables 95.054490\n",
"percentage_tables_with_labels 98.298616\n",
"percentage_tables_with_captions_titles 98.189980\n",
"percentage_fig_with_labels 98.899578\n",
"percentage_fig_with_captions_titles 98.796140"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"overall_percentages = {\n",
" \"total_docs\": df[\"total_docs\"].sum(),\n",
" \"total_tables\": df[\"table_wrap_count\"].sum(),\n",
" \"total_figs\": df[\"fig_count\"].sum(),\n",
" \"percentage_html_tables\": [(df[\"table_wrap_table_count\"].sum() / df[\"table_wrap_count\"].sum()) * 100],\n",
" \"percentage_tables_with_labels\": [(df[\"table_wrap_label_count\"].sum() / df[\"table_wrap_count\"].sum()) * 100],\n",
" \"percentage_tables_with_captions_titles\": [(df[\"table_wrap_caption_title_count\"].sum() / df[\"table_wrap_count\"].sum()) * 100],\n",
" \"percentage_fig_with_labels\": [(df[\"fig_label_count\"].sum() / df[\"fig_count\"].sum()) * 100],\n",
" \"percentage_fig_with_captions_titles\": [(df[\"fig_caption_title_count\"].sum() / df[\"fig_count\"].sum()) * 100],\n",
"}\n",
"df_overall_percentages = pd.DataFrame.from_dict(overall_percentages)\n",
"df_overall_percentages.T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Na tabela acima mês de valor `0` corresponde aos XMLs que não possuem a indicação do mês de publicação. Isso pode ocorrer devido a não obrigatoriedade do metadado.\n",
"\n",
"Em uma análise preliminar sobre aspectos de acessibilidade em 82716 documentos publicados entre janeiro de 2017 e agosto de 2020, dentre 173.976 tabelas, 95,05% estão codificadas em HTML, 98,29% apresentam rótulo descritivo e 98,18% legenda. Ainda no mesmo conjunto de documentos, dentre 205.921 figuras, 98,89% apresentam rótulo descritivo e 98,79% legenda."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment