Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A jupyter notebook showing how Camelot can be used to extract tables from PDFs scraped from the IDSP website.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tracking disease outbreaks in India using Camelot\n",
"\n",
"The code here builds upon this [blog post](https://blog.socialcops.com/technology/data-science/apache-airflow-disease-outbreaks-india/). It scrapes PDFs that contain disease outbreaks data from the [IDSP website](http://idsp.nic.in/index4.php?lang=1&level=0&linkid=406&lid=3689) based on the year, from_week and to_week variables in the first cell; then extracts tables from those PDFs using Camelot.",
"\n",
"EDIT: Currently, this notebook works only with Python 2.7, I'll update it soon to work with both Python 2 and 3."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"\n",
"\n",
"def mkdir(path):\n",
" \"\"\"Make directory\n",
" \n",
" Parameters\n",
" ----------\n",
" path : str\n",
" \n",
" \"\"\"\n",
" if not os.path.exists(path):\n",
" os.makedirs(path)\n",
"\n",
"\n",
"# replace vinayak with your username\n",
"data_dir = '/home/vinayak/dev/etl-camelot/data'\n",
"year = 2018\n",
"from_week = to_week = 33"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from lxml import etree\n",
" \n",
" \n",
"def scrape_web(year=2018, from_week=1, to_week=52):\n",
" \"\"\"Scrape PDFs from the IDSP website\n",
" http://idsp.nic.in/index4.php?lang=1&level=0&linkid=406&lid=3689\n",
" \n",
" Parameters\n",
" ----------\n",
" year : int\n",
" from_week : int\n",
" to_week : int\n",
" \n",
" \"\"\"\n",
" year_dir = os.path.join(data_dir, str(year))\n",
" mkdir(year_dir)\n",
"\n",
" crawl = {}\n",
" r = requests.get('http://idsp.nic.in/index4.php?lang=1&level=0&linkid=406&lid=3689')\n",
" tree = etree.fromstring(r.content, etree.HTMLParser())\n",
" table = tree.xpath('//*[@id=\"cmscontent\"]')\n",
" rows = table[0].cssselect('tr')\n",
" for r in rows[1:]:\n",
" try:\n",
" y = int(r.cssselect('td')[0].cssselect('span')[0].xpath('text()')[0])\n",
" except IndexError:\n",
" y = int(r.cssselect('td')[0].cssselect('div')[0].xpath('text()')[0])\n",
" crawl[y] = {}\n",
" links = r.cssselect('td')[1].cssselect('a')\n",
" for l in links:\n",
" try:\n",
" m = re.search(r'\\d+', l.xpath('text()')[0])\n",
" except IndexError:\n",
" m = re.search(r'\\d+', l.cssselect('span')[0].xpath('text()')[0])\n",
" week = int(m.group(0))\n",
" link = l.xpath('@href')[0]\n",
" crawl[y][week] = link\n",
"\n",
" to_download = filter(lambda x: from_week <= x <= to_week, crawl[year])\n",
" to_download = sorted(to_download)\n",
" print 'Found {} PDF(s) for download'.format(len(to_download))\n",
" if len(to_download):\n",
" for w in to_download:\n",
" print 'Downloading year {}, week {} ...'.format(year, w)\n",
" link = crawl[year][w]\n",
" r = requests.get(link, stream=True)\n",
" filename = os.path.join(year_dir, '{}.pdf'.format(w))\n",
" with open(filename, 'wb') as f:\n",
" for chunk in r.iter_content(1024):\n",
" f.write(chunk)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 1 PDF(s) for download\n",
"Downloading year 2018, week 33 ...\n"
]
}
],
"source": [
"scrape_web(year=2018, from_week=from_week, to_week=to_week)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import camelot\n",
"\n",
"\n",
"def extract_tables(year=2018, from_week=1, to_week=52):\n",
" \"\"\"Extract tables from downloaded PDFs using Camelot\n",
" \n",
" Parameters\n",
" ----------\n",
" year : int\n",
" from_week : int\n",
" to_week : int\n",
" \n",
" \"\"\"\n",
" year_dir = os.path.join(data_dir, str(year))\n",
" if not os.path.exists(year_dir):\n",
" print 'Year {} not found'.format(year)\n",
" return None\n",
"\n",
" all_tables = []\n",
" for i in range(from_week, to_week + 1):\n",
" filename = '{}.pdf'.format(i)\n",
" filename = os.path.join(year_dir, filename)\n",
" print 'Processing {} ...'.format(filename)\n",
" tables = camelot.read_pdf(filename, pages='3-end', line_size_scaling=40)\n",
" print 'Found {} tables(s)'.format(tables.n)\n",
" all_tables.append(tables)\n",
" \n",
" return all_tables"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /home/vinayak/dev/etl-camelot/data/2018/33.pdf ...\n",
"Found 7 tables(s)\n"
]
}
],
"source": [
"all_tables = extract_tables(year=2018, from_week=from_week, to_week=to_week)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"\n",
"headers = [\n",
" 'unique_id',\n",
" 'state',\n",
" 'district',\n",
" 'disease_illness',\n",
" 'num_cases',\n",
" 'num_deaths',\n",
" 'date_of_start_of_outbreak',\n",
" 'date_of_reporting',\n",
" 'current_status',\n",
" 'comment_action_taken',\n",
" 'reported_late',\n",
" 'under_surveillance'\n",
"]\n",
"ten_headers = [\n",
" 'unique_id',\n",
" 'state',\n",
" 'district',\n",
" 'disease_illness',\n",
" 'num_cases',\n",
" 'num_deaths',\n",
" 'date_of_start_of_outbreak',\n",
" 'date_of_reporting',\n",
" 'current_status',\n",
" 'comment_action_taken'\n",
"]\n",
"nine_headers = [\n",
" 'unique_id',\n",
" 'state',\n",
" 'district',\n",
" 'disease_illness',\n",
" 'num_cases',\n",
" 'num_deaths',\n",
" 'date_of_start_of_outbreak',\n",
" 'current_status',\n",
" 'comment_action_taken'\n",
"]\n",
"\n",
"\n",
"def append_tables(all_tables):\n",
" \"\"\"Append all tables in PDFs\n",
" \n",
" Parameters\n",
" ----------\n",
" all_tables : list\n",
" \n",
" \"\"\"\n",
" df = pd.DataFrame(columns=headers)\n",
" for tables in all_tables:\n",
" for table in tables:\n",
" columns = list(table.df.iloc[0])\n",
" if table.shape[1] == 10:\n",
" temp = table.df.copy()\n",
" if 'unique' in columns[0].lower():\n",
" temp = temp.iloc[1:]\n",
" temp.columns = ten_headers\n",
" temp['reported_late'] = False\n",
" temp['under_surveillance'] = False\n",
" df = pd.concat([df, temp], sort=False)\n",
" elif table.shape[1] == 9:\n",
" temp = table.df.copy()\n",
" if 'disease' in columns[0].lower():\n",
" c = temp.iloc[0]\n",
" temp = temp.iloc[2:]\n",
" temp.columns = nine_headers\n",
" if 'reportedlate' in c[0].lower().replace(' ', ''): \n",
" temp['reported_late'] = True\n",
" temp['under_surveillance'] = False\n",
" elif 'undersurv' in c[0].lower().replace(' ', ''):\n",
" temp['reported_late'] = False\n",
" temp['under_surveillance'] = True\n",
" df = pd.concat([df, temp], sort=False)\n",
" else:\n",
" temp.columns = nine_headers\n",
" temp['reported_late'] = True\n",
" temp['under_surveillance'] = False\n",
" df = pd.concat([df, temp], sort=False)\n",
" return df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The append logic above is just a heuristic, which would need to change if the table structure changes in future."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>unique_id</th>\n",
" <th>state</th>\n",
" <th>district</th>\n",
" <th>disease_illness</th>\n",
" <th>num_cases</th>\n",
" <th>num_deaths</th>\n",
" <th>date_of_start_of_outbreak</th>\n",
" <th>date_of_reporting</th>\n",
" <th>current_status</th>\n",
" <th>comment_action_taken</th>\n",
" <th>reported_late</th>\n",
" <th>under_surveillance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AR/NMI/2018/33/1159</td>\n",
" <td>Arunachal Pradesh</td>\n",
" <td>Namsai</td>\n",
" <td>Japanese Encephalitis</td>\n",
" <td>01</td>\n",
" <td>00</td>\n",
" <td>16-08-18</td>\n",
" <td>20-18-18</td>\n",
" <td>Under Surveillance</td>\n",
" <td>Cases of fever with encephalitis reported from...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>JH/RCH/2018/33/1160</td>\n",
" <td>Jharkhand</td>\n",
" <td>Ranchi</td>\n",
" <td>Food Poisoning</td>\n",
" <td>10</td>\n",
" <td>00</td>\n",
" <td>13-08-18</td>\n",
" <td>13-08-18</td>\n",
" <td>Under Control</td>\n",
" <td>Cases of vomiting with pain abdomen reported f...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>KL/KOZ/2018/33/1161</td>\n",
" <td>Kerala</td>\n",
" <td>Kozhikode</td>\n",
" <td>Viral Hepatitis A</td>\n",
" <td>26</td>\n",
" <td>00</td>\n",
" <td>16-08-18</td>\n",
" <td>20-08-18</td>\n",
" <td>Under Surveillance</td>\n",
" <td>Cases of jaundice with fever reported fro...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>PB/PTL/2018/33/1162</td>\n",
" <td>Punjab</td>\n",
" <td>Patiala</td>\n",
" <td>Acute Diarrheal Diseases</td>\n",
" <td>71</td>\n",
" <td>00</td>\n",
" <td>14-08-18</td>\n",
" <td>18-08-18</td>\n",
" <td>Under Surveillance</td>\n",
" <td>Cases of loose stools with vomiting repo...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PB/MUK/2018/33/1163</td>\n",
" <td>Punjab</td>\n",
" <td>Muktsar</td>\n",
" <td>Viral Hepatitis E</td>\n",
" <td>30</td>\n",
" <td>00</td>\n",
" <td>17-08-18</td>\n",
" <td>23-08-18</td>\n",
" <td>Under Surveillance</td>\n",
" <td>Cases of jaundice reported from Gidderbah...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>RJ/DGR/2018/33/1164</td>\n",
" <td>Rajasthan</td>\n",
" <td>Dungarpur</td>\n",
" <td>Acute Diarrheal Diseases</td>\n",
" <td>10</td>\n",
" <td>01</td>\n",
" <td>21-08-18</td>\n",
" <td>21-08-18</td>\n",
" <td>Under Control</td>\n",
" <td>Cases of loose stools with vomiting repo...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>TN/TNR/2018/34/1165</td>\n",
" <td>Tamil Nadu</td>\n",
" <td>Tiruvannamalai</td>\n",
" <td>Viral Hepatitis A</td>\n",
" <td>11</td>\n",
" <td>00</td>\n",
" <td>16-08-18</td>\n",
" <td>20-08-18</td>\n",
" <td>Under Control</td>\n",
" <td>Cases of jaundice reported from Village Kolun...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TN/ERD/2018/34/1166</td>\n",
" <td>Tamil Nadu</td>\n",
" <td>Erode</td>\n",
" <td>Food Borne Illness</td>\n",
" <td>588</td>\n",
" <td>00</td>\n",
" <td>15-08-18</td>\n",
" <td>16-08-18</td>\n",
" <td>Under Surveillance</td>\n",
" <td>Cases of vomiting with loose stools repo...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TL/KRN/2018/33/1167</td>\n",
" <td>Telangana</td>\n",
" <td>Krimnagar</td>\n",
" <td>Dengue</td>\n",
" <td>06</td>\n",
" <td>00</td>\n",
" <td>13-08-18</td>\n",
" <td>21-08-18</td>\n",
" <td>Under Control</td>\n",
" <td>Cases of fever reported from Village Patarlapa...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>WB/MDW/2018/33/1168</td>\n",
" <td>West Bengal</td>\n",
" <td>Paschim Medinipur</td>\n",
" <td>Dengue</td>\n",
" <td>497</td>\n",
" <td>00</td>\n",
" <td>06-08-18</td>\n",
" <td>06-08-18</td>\n",
" <td>Under Surveillance</td>\n",
" <td>Cases of fever with rash reported from Village...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>KN/CKR/2018/33/1169</td>\n",
" <td>Karnataka</td>\n",
" <td>Chickballapura</td>\n",
" <td>Chikungunya</td>\n",
" <td>15</td>\n",
" <td>00</td>\n",
" <td>06-08-18</td>\n",
" <td>NaN</td>\n",
" <td>Under Control</td>\n",
" <td>Cases of fever with joint pains reported fr...</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>KN/TUM/2018/33/1170</td>\n",
" <td>Karnataka</td>\n",
" <td>Tumkur</td>\n",
" <td>Chickenpox</td>\n",
" <td>28</td>\n",
" <td>00</td>\n",
" <td>06-08-18</td>\n",
" <td>NaN</td>\n",
" <td>Under Surveillance</td>\n",
" <td>Cases of fever with rash reported from ...</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>KL/CNR/2018/33/1171</td>\n",
" <td>Kerala</td>\n",
" <td>Kannur</td>\n",
" <td>Viral Hepatitis</td>\n",
" <td>54</td>\n",
" <td>00</td>\n",
" <td>03-08-18</td>\n",
" <td>NaN</td>\n",
" <td>Under Surveillance</td>\n",
" <td>Cases of jaundice with fever reported from V...</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MN/TBL/2018/33/1172</td>\n",
" <td>Manipur</td>\n",
" <td>Thoubal</td>\n",
" <td>Japanese Encephalitis</td>\n",
" <td>02</td>\n",
" <td>00</td>\n",
" <td>02-07-18</td>\n",
" <td>NaN</td>\n",
" <td>Under Surveillance</td>\n",
" <td>Cases of fever with encephalitis reported f...</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>OR/KHU/2018/33/1173</td>\n",
" <td>Odisha</td>\n",
" <td>Khordha</td>\n",
" <td>Viral Hepatitis</td>\n",
" <td>08</td>\n",
" <td>00</td>\n",
" <td>14-07-18</td>\n",
" <td>NaN</td>\n",
" <td>Under Control</td>\n",
" <td>Cases of jaundice reported from Village ...</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>TN/VRN/2018/34/1174</td>\n",
" <td>Tamil Nadu</td>\n",
" <td>Virudhunagar-Sivakasi</td>\n",
" <td>Chikungunya</td>\n",
" <td>44</td>\n",
" <td>00</td>\n",
" <td>14-08-18</td>\n",
" <td>NaN</td>\n",
" <td>Under Surveillance</td>\n",
" <td>Cases of fever reported from Village/SC Ramasa...</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>WB/MRS/2018/30/1114</td>\n",
" <td>West Bengal</td>\n",
" <td>Murshidabad</td>\n",
" <td>Dengue</td>\n",
" <td>532</td>\n",
" <td>00</td>\n",
" <td>24-07-18</td>\n",
" <td>NaN</td>\n",
" <td>Under Surveillance</td>\n",
" <td>The outbreak was reported in the 30th w...</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>HP/BLR/2018/25/870</td>\n",
" <td>Himachal Pradesh</td>\n",
" <td>Bilaspur</td>\n",
" <td>Dengue</td>\n",
" <td></td>\n",
" <td>00</td>\n",
" <td>28-05-18</td>\n",
" <td>NaN</td>\n",
" <td>Under Surveillance</td>\n",
" <td>Outbreak of Dengue reported in 25th week of th...</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" unique_id state district \\\n",
"1 AR/NMI/2018/33/1159 Arunachal Pradesh Namsai \n",
"2 JH/RCH/2018/33/1160 Jharkhand Ranchi \n",
"3 KL/KOZ/2018/33/1161 Kerala Kozhikode \n",
"4 PB/PTL/2018/33/1162 Punjab Patiala \n",
"0 PB/MUK/2018/33/1163 Punjab Muktsar \n",
"1 RJ/DGR/2018/33/1164 Rajasthan Dungarpur \n",
"2 TN/TNR/2018/34/1165 Tamil Nadu Tiruvannamalai \n",
"0 TN/ERD/2018/34/1166 Tamil Nadu Erode \n",
"1 TL/KRN/2018/33/1167 Telangana Krimnagar \n",
"0 WB/MDW/2018/33/1168 West Bengal Paschim Medinipur \n",
"2 KN/CKR/2018/33/1169 Karnataka Chickballapura \n",
"0 KN/TUM/2018/33/1170 Karnataka Tumkur \n",
"1 KL/CNR/2018/33/1171 Kerala Kannur \n",
"2 MN/TBL/2018/33/1172 Manipur Thoubal \n",
"3 OR/KHU/2018/33/1173 Odisha Khordha \n",
"4 TN/VRN/2018/34/1174 Tamil Nadu Virudhunagar-Sivakasi \n",
"2 WB/MRS/2018/30/1114 West Bengal Murshidabad \n",
"3 HP/BLR/2018/25/870 Himachal Pradesh Bilaspur \n",
"\n",
" disease_illness num_cases num_deaths date_of_start_of_outbreak \\\n",
"1 Japanese Encephalitis 01 00 16-08-18 \n",
"2 Food Poisoning 10 00 13-08-18 \n",
"3 Viral Hepatitis A 26 00 16-08-18 \n",
"4 Acute Diarrheal Diseases 71 00 14-08-18 \n",
"0 Viral Hepatitis E 30 00 17-08-18 \n",
"1 Acute Diarrheal Diseases 10 01 21-08-18 \n",
"2 Viral Hepatitis A 11 00 16-08-18 \n",
"0 Food Borne Illness 588 00 15-08-18 \n",
"1 Dengue 06 00 13-08-18 \n",
"0 Dengue 497 00 06-08-18 \n",
"2 Chikungunya 15 00 06-08-18 \n",
"0 Chickenpox 28 00 06-08-18 \n",
"1 Viral Hepatitis 54 00 03-08-18 \n",
"2 Japanese Encephalitis 02 00 02-07-18 \n",
"3 Viral Hepatitis 08 00 14-07-18 \n",
"4 Chikungunya 44 00 14-08-18 \n",
"2 Dengue 532 00 24-07-18 \n",
"3 Dengue 00 28-05-18 \n",
"\n",
" date_of_reporting current_status \\\n",
"1 20-18-18 Under Surveillance \n",
"2 13-08-18 Under Control \n",
"3 20-08-18 Under Surveillance \n",
"4 18-08-18 Under Surveillance \n",
"0 23-08-18 Under Surveillance \n",
"1 21-08-18 Under Control \n",
"2 20-08-18 Under Control \n",
"0 16-08-18 Under Surveillance \n",
"1 21-08-18 Under Control \n",
"0 06-08-18 Under Surveillance \n",
"2 NaN Under Control \n",
"0 NaN Under Surveillance \n",
"1 NaN Under Surveillance \n",
"2 NaN Under Surveillance \n",
"3 NaN Under Control \n",
"4 NaN Under Surveillance \n",
"2 NaN Under Surveillance \n",
"3 NaN Under Surveillance \n",
"\n",
" comment_action_taken reported_late \\\n",
"1 Cases of fever with encephalitis reported from... False \n",
"2 Cases of vomiting with pain abdomen reported f... False \n",
"3 Cases of jaundice with fever reported fro... False \n",
"4 Cases of loose stools with vomiting repo... False \n",
"0 Cases of jaundice reported from Gidderbah... False \n",
"1 Cases of loose stools with vomiting repo... False \n",
"2 Cases of jaundice reported from Village Kolun... False \n",
"0 Cases of vomiting with loose stools repo... False \n",
"1 Cases of fever reported from Village Patarlapa... False \n",
"0 Cases of fever with rash reported from Village... False \n",
"2 Cases of fever with joint pains reported fr... True \n",
"0 Cases of fever with rash reported from ... True \n",
"1 Cases of jaundice with fever reported from V... True \n",
"2 Cases of fever with encephalitis reported f... True \n",
"3 Cases of jaundice reported from Village ... True \n",
"4 Cases of fever reported from Village/SC Ramasa... True \n",
"2 The outbreak was reported in the 30th w... False \n",
"3 Outbreak of Dengue reported in 25th week of th... False \n",
"\n",
" under_surveillance \n",
"1 False \n",
"2 False \n",
"3 False \n",
"4 False \n",
"0 False \n",
"1 False \n",
"2 False \n",
"0 False \n",
"1 False \n",
"0 False \n",
"2 False \n",
"0 False \n",
"1 False \n",
"2 False \n",
"3 False \n",
"4 False \n",
"2 True \n",
"3 True "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = append_tables(all_tables)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that you have a clean table, you can do further analysis, or just load it into a data warehouse for querying/visualization."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"filename = os.path.join(data_dir, 'data.csv')\n",
"df.to_csv(filename, index=False, quoting=1, encoding='utf-8')"
]
}
],
"metadata": {
"authors": [
{
"email": "vmehta94@gmail.com",
"name": "Vinayak Mehta"
}
],
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.15rc1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.