Created
June 11, 2021 10:44
-
-
Save rishi-a/3aa658f8ee4b983c7297e39eed36d4b8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Requirement already satisfied: tabula-py in c:\\programdata\\anaconda3\\lib\\site-packages (2.2.0)\n", | |
"Requirement already satisfied: distro in c:\\programdata\\anaconda3\\lib\\site-packages (from tabula-py) (1.5.0)\n", | |
"Requirement already satisfied: pandas>=0.25.3 in c:\\programdata\\anaconda3\\lib\\site-packages (from tabula-py) (1.1.0)\n", | |
"Requirement already satisfied: numpy in c:\\programdata\\anaconda3\\lib\\site-packages (from tabula-py) (1.19.1)\n", | |
"Requirement already satisfied: python-dateutil>=2.7.3 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.25.3->tabula-py) (2.8.1)\n", | |
"Requirement already satisfied: pytz>=2017.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas>=0.25.3->tabula-py) (2018.4)\n", | |
"Requirement already satisfied: six>=1.5 in c:\\programdata\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7.3->pandas>=0.25.3->tabula-py) (1.11.0)\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"WARNING: You are using pip version 19.2.3, however version 21.1.2 is available.\n", | |
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Install into the running kernel's environment (%pip, not !pip).\n", | |
"# Pinned to 2.2.0, the version reported in this cell's recorded output.\n", | |
"%pip install tabula-py==2.2.0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from tabula import read_pdf\n", | |
"\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Source bulletin; edit this one path to process a different file.\n", | |
"PDF_PATH = 'C:/Users/Rishiraj/Desktop/AQI_Bulletin_20210107.pdf'\n", | |
"pages = 2  # number of PDF pages to read, starting at page 1\n", | |
"\n", | |
"# Collect one cleaned DataFrame per page; they are concatenated in a later cell.\n", | |
"frames = []\n", | |
"for page in range(1, pages + 1):\n", | |
"    # read_pdf returns a list of tables found on the page; only the first is used.\n", | |
"    content = read_pdf(PDF_PATH, pages=page)\n", | |
"    if not content:\n", | |
"        # Fail with the page number instead of a bare IndexError on content[0].\n", | |
"        raise ValueError(f'no table detected on page {page} of {PDF_PATH}')\n", | |
"    df = content[0].iloc[::2]  # keep every other row (skips the NaN rows)\n", | |
"    df = df.iloc[1:]  # skip first row\n", | |
"    frames.append(df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Stack the per-page tables into one frame.\n", | |
"# ignore_index=True renumbers rows 0..n-1; otherwise every page contributes the\n", | |
"# same row labels and the combined index contains duplicates.\n", | |
"finalDf = pd.concat(frames, ignore_index=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 67, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Replace the extracted header with six explicit labels, assigned by position.\n", | |
"column_labels = [\n", | |
"    'index',\n", | |
"    'city',\n", | |
"    'air quality',\n", | |
"    'index value',\n", | |
"    'pollutant',\n", | |
"    'monitoring stations',\n", | |
"]\n", | |
"finalDf.columns = column_labels" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Write the combined table to CSV.\n", | |
"# index=False: the frame already carries its own 'index' column, so do not also\n", | |
"# write the pandas row labels as an extra unnamed first column.\n", | |
"finalDf.to_csv('C:/Users/Rishiraj/Desktop/AQI_Bulletin_20210107.csv', index=False)\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## pdftables_api\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 92, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"\n", | |
"import pdftables_api\n", | |
"\n", | |
"# NOTE: pdftables is a paid tool; the original author's note calls it useless for this task.\n", | |
"\n", | |
"# API key verification: read the key from the environment instead of\n", | |
"# hard-coding it (the saved notebook had an empty string here).\n", | |
"api_key = os.environ.get('PDFTABLES_API_KEY', '')\n", | |
"if not api_key:\n", | |
"    raise RuntimeError('Set the PDFTABLES_API_KEY environment variable before running this cell.')\n", | |
"conversion = pdftables_api.Client(api_key)\n", | |
"\n", | |
"# PDF to CSV: arguments are (input PDF path, output CSV path), e.g. (Hello.pdf, Hello)\n", | |
"conversion.csv('C:/Users/Rishiraj/Desktop/AQI_Bulletin_20210107.pdf', 'C:/Users/Rishiraj/Desktop/AQI_Bulletin_20210107_Paid.csv')\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"hide_input": false, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment