Created
August 12, 2020 17:56
-
-
Save chetanambi/c8f029c7b18759a3abb57a7e55f8ede5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import camelot" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"tables = camelot.read_pdf('https://trade.indiabulls.com/pdf/NSE_Holidays_Equity.pdf')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Number of tables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2\n" | |
] | |
} | |
], | |
"source": [ | |
"print(tables.n)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Parsing report" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'accuracy': 100.0, 'whitespace': 0.0, 'order': 1, 'page': 1}\n" | |
] | |
} | |
], | |
"source": [ | |
"print(tables[0].parsing_report)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'accuracy': 100.0, 'whitespace': 0.0, 'order': 1, 'page': 1}\n", | |
"{'accuracy': 100.0, 'whitespace': 0.0, 'order': 2, 'page': 1}\n" | |
] | |
} | |
], | |
"source": [ | |
"for i in range(tables.n):\n", | |
" print(tables[i].parsing_report)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Table shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<Table shape=(13, 4)>\n" | |
] | |
} | |
], | |
"source": [ | |
"print(tables[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<Table shape=(13, 4)>\n", | |
"<Table shape=(8, 4)>\n" | |
] | |
} | |
], | |
"source": [ | |
"for i in range(tables.n):\n", | |
" print(tables[i])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Export the tables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"tables[0].to_csv('nse_holiday_list_table1.csv') # other export methods: to_json, to_excel, to_html, to_sqlite" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Export to pandas DataFrame" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>3</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Sr. No.</td>\n", | |
" <td>Date</td>\n", | |
" <td>Day</td>\n", | |
" <td>Description</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>February 21, 2020</td>\n", | |
" <td>Friday</td>\n", | |
" <td>Mahashivratri</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>March 10, 2020</td>\n", | |
" <td>Tuesday</td>\n", | |
" <td>Holi</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>April 02,2020</td>\n", | |
" <td>Thursday</td>\n", | |
" <td>Ram Navami</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>April 06,2020</td>\n", | |
" <td>Monday</td>\n", | |
" <td>Mahavir Jayanti</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>5</td>\n", | |
" <td>April 10, 2020</td>\n", | |
" <td>Friday</td>\n", | |
" <td>Good Friday</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>6</td>\n", | |
" <td>April 14,2020</td>\n", | |
" <td>Tuesday</td>\n", | |
" <td>Dr.Baba Saheb Ambedkar Jayanti</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>7</td>\n", | |
" <td>May 01,2020</td>\n", | |
" <td>Friday</td>\n", | |
" <td>Maharashtra Day</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>8</td>\n", | |
" <td>May 25, 2020</td>\n", | |
" <td>Monday</td>\n", | |
" <td>Id-Ul-Fitr (Ramzan ID)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>9</td>\n", | |
" <td>October 02, 2020</td>\n", | |
" <td>Friday</td>\n", | |
" <td>Mahatma Gandhi Jayanti</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>10</td>\n", | |
" <td>November 16, 2020</td>\n", | |
" <td>Monday</td>\n", | |
" <td>Diwali-Balipratipada</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>11</td>\n", | |
" <td>November 30, 2020</td>\n", | |
" <td>Monday</td>\n", | |
" <td>Gurunanak Jayanti</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>12</td>\n", | |
" <td>December 25, 2020</td>\n", | |
" <td>Friday</td>\n", | |
" <td>Christmas</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 0 1 2 3\n", | |
"0 Sr. No. Date Day Description\n", | |
"1 1 February 21, 2020 Friday Mahashivratri\n", | |
"2 2 March 10, 2020 Tuesday Holi\n", | |
"3 3 April 02,2020 Thursday Ram Navami\n", | |
"4 4 April 06,2020 Monday Mahavir Jayanti\n", | |
"5 5 April 10, 2020 Friday Good Friday\n", | |
"6 6 April 14,2020 Tuesday Dr.Baba Saheb Ambedkar Jayanti\n", | |
"7 7 May 01,2020 Friday Maharashtra Day\n", | |
"8 8 May 25, 2020 Monday Id-Ul-Fitr (Ramzan ID)\n", | |
"9 9 October 02, 2020 Friday Mahatma Gandhi Jayanti\n", | |
"10 10 November 16, 2020 Monday Diwali-Balipratipada\n", | |
"11 11 November 30, 2020 Monday Gurunanak Jayanti\n", | |
"12 12 December 25, 2020 Friday Christmas" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tables[0].df " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Sr. No.</th>\n", | |
" <th>Date</th>\n", | |
" <th>Day</th>\n", | |
" <th>Description</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>February 21, 2020</td>\n", | |
" <td>Friday</td>\n", | |
" <td>Mahashivratri</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>March 10, 2020</td>\n", | |
" <td>Tuesday</td>\n", | |
" <td>Holi</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>April 02,2020</td>\n", | |
" <td>Thursday</td>\n", | |
" <td>Ram Navami</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>April 06,2020</td>\n", | |
" <td>Monday</td>\n", | |
" <td>Mahavir Jayanti</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>5</td>\n", | |
" <td>April 10, 2020</td>\n", | |
" <td>Friday</td>\n", | |
" <td>Good Friday</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>6</td>\n", | |
" <td>April 14,2020</td>\n", | |
" <td>Tuesday</td>\n", | |
" <td>Dr.Baba Saheb Ambedkar Jayanti</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>7</td>\n", | |
" <td>May 01,2020</td>\n", | |
" <td>Friday</td>\n", | |
" <td>Maharashtra Day</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>8</td>\n", | |
" <td>May 25, 2020</td>\n", | |
" <td>Monday</td>\n", | |
" <td>Id-Ul-Fitr (Ramzan ID)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>9</td>\n", | |
" <td>October 02, 2020</td>\n", | |
" <td>Friday</td>\n", | |
" <td>Mahatma Gandhi Jayanti</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>10</td>\n", | |
" <td>November 16, 2020</td>\n", | |
" <td>Monday</td>\n", | |
" <td>Diwali-Balipratipada</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>11</td>\n", | |
" <td>November 30, 2020</td>\n", | |
" <td>Monday</td>\n", | |
" <td>Gurunanak Jayanti</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>12</td>\n", | |
" <td>December 25, 2020</td>\n", | |
" <td>Friday</td>\n", | |
" <td>Christmas</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Sr. No. Date Day Description\n", | |
"1 1 February 21, 2020 Friday Mahashivratri\n", | |
"2 2 March 10, 2020 Tuesday Holi\n", | |
"3 3 April 02,2020 Thursday Ram Navami\n", | |
"4 4 April 06,2020 Monday Mahavir Jayanti\n", | |
"5 5 April 10, 2020 Friday Good Friday\n", | |
"6 6 April 14,2020 Tuesday Dr.Baba Saheb Ambedkar Jayanti\n", | |
"7 7 May 01,2020 Friday Maharashtra Day\n", | |
"8 8 May 25, 2020 Monday Id-Ul-Fitr (Ramzan ID)\n", | |
"9 9 October 02, 2020 Friday Mahatma Gandhi Jayanti\n", | |
"10 10 November 16, 2020 Monday Diwali-Balipratipada\n", | |
"11 11 November 30, 2020 Monday Gurunanak Jayanti\n", | |
"12 12 December 25, 2020 Friday Christmas" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"temp_df = tables[0].df \n", | |
"temp_df.rename(columns=temp_df.iloc[0]).drop(temp_df.index[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>3</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Sr. No.</td>\n", | |
" <td>Date</td>\n", | |
" <td>Day</td>\n", | |
" <td>Description</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>January 26, 2020</td>\n", | |
" <td>Sunday</td>\n", | |
" <td>Republic Day</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>August 01, 2020</td>\n", | |
" <td>Saturday</td>\n", | |
" <td>Bakri Id</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>August 15, 2020</td>\n", | |
" <td>Saturday</td>\n", | |
" <td>Independence Day</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>August 22, 2020</td>\n", | |
" <td>Saturday</td>\n", | |
" <td>Ganesh Chaturthi</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>5</td>\n", | |
" <td>August 30, 2020</td>\n", | |
" <td>Sunday</td>\n", | |
" <td>Moharram</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>6</td>\n", | |
" <td>October 25, 2020</td>\n", | |
" <td>Sunday</td>\n", | |
" <td>Dasera</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>7</td>\n", | |
" <td>November 14, 2020</td>\n", | |
" <td>Saturday</td>\n", | |
" <td>Diwali-Laxmi Pujan*</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 0 1 2 3\n", | |
"0 Sr. No. Date Day Description\n", | |
"1 1 January 26, 2020 Sunday Republic Day\n", | |
"2 2 August 01, 2020 Saturday Bakri Id\n", | |
"3 3 August 15, 2020 Saturday Independence Day\n", | |
"4 4 August 22, 2020 Saturday Ganesh Chaturthi\n", | |
"5 5 August 30, 2020 Sunday Moharram\n", | |
"6 6 October 25, 2020 Sunday Dasera\n", | |
"7 7 November 14, 2020 Saturday Diwali-Laxmi Pujan*" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tables[1].df # get a pandas DataFrame!" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Sr. No.</th>\n", | |
" <th>Date</th>\n", | |
" <th>Day</th>\n", | |
" <th>Description</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>January 26, 2020</td>\n", | |
" <td>Sunday</td>\n", | |
" <td>Republic Day</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>August 01, 2020</td>\n", | |
" <td>Saturday</td>\n", | |
" <td>Bakri Id</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>August 15, 2020</td>\n", | |
" <td>Saturday</td>\n", | |
" <td>Independence Day</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>August 22, 2020</td>\n", | |
" <td>Saturday</td>\n", | |
" <td>Ganesh Chaturthi</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>5</td>\n", | |
" <td>August 30, 2020</td>\n", | |
" <td>Sunday</td>\n", | |
" <td>Moharram</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>6</td>\n", | |
" <td>October 25, 2020</td>\n", | |
" <td>Sunday</td>\n", | |
" <td>Dasera</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>7</td>\n", | |
" <td>November 14, 2020</td>\n", | |
" <td>Saturday</td>\n", | |
" <td>Diwali-Laxmi Pujan*</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Sr. No. Date Day Description\n", | |
"1 1 January 26, 2020 Sunday Republic Day\n", | |
"2 2 August 01, 2020 Saturday Bakri Id\n", | |
"3 3 August 15, 2020 Saturday Independence Day\n", | |
"4 4 August 22, 2020 Saturday Ganesh Chaturthi\n", | |
"5 5 August 30, 2020 Sunday Moharram\n", | |
"6 6 October 25, 2020 Sunday Dasera\n", | |
"7 7 November 14, 2020 Saturday Diwali-Laxmi Pujan*" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"temp_df = tables[1].df \n", | |
"temp_df.rename(columns=temp_df.iloc[0]).drop(temp_df.index[0])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": false, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": true | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment