Last active
March 2, 2018 15:26
-
-
Save psychemedia/4094626f3b20d8d629381feb673c9c48 to your computer and use it in GitHub Desktop.
Quick demo of crude scrape of register of MP interests
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Collecting parse\n", | |
" Downloading parse-1.8.2.tar.gz\n", | |
"Building wheels for collected packages: parse\n", | |
" Running setup.py bdist_wheel for parse ... \u001b[?25ldone\n", | |
"\u001b[?25h Stored in directory: /home/jovyan/.cache/pip/wheels/b0/3b/37/4ab694a0d331a3cc487923ff9c3645d0f103e9cd4762065f77\n", | |
"Successfully built parse\n", | |
"Installing collected packages: parse\n", | |
"Successfully installed parse-1.8.2\n" | |
] | |
} | |
], | |
"source": [ | |
"#https://github.com/r1chardj0n3s/parse\n", | |
"!pip install parse" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Collecting requests_html\n", | |
" Downloading requests_html-0.7.2-py2.py3-none-any.whl\n", | |
"Collecting w3lib (from requests_html)\n", | |
" Downloading w3lib-1.19.0-py2.py3-none-any.whl\n", | |
"Collecting pyppeteer (from requests_html)\n", | |
" Downloading pyppeteer-0.0.12.tar.gz (70kB)\n", | |
"\u001b[K 100% |████████████████████████████████| 71kB 4.9MB/s eta 0:00:01\n", | |
"\u001b[?25hCollecting fake-useragent (from requests_html)\n", | |
" Downloading fake-useragent-0.1.10.tar.gz\n", | |
"Collecting bs4 (from requests_html)\n", | |
" Downloading bs4-0.0.1.tar.gz\n", | |
"Requirement already satisfied: requests in /opt/conda/lib/python3.6/site-packages (from requests_html)\n", | |
"Requirement already satisfied: parse in /opt/conda/lib/python3.6/site-packages (from requests_html)\n", | |
"Collecting pyquery (from requests_html)\n", | |
" Downloading pyquery-1.4.0-py2.py3-none-any.whl\n", | |
"Requirement already satisfied: six>=1.4.1 in /opt/conda/lib/python3.6/site-packages (from w3lib->requests_html)\n", | |
"Collecting pyee (from pyppeteer->requests_html)\n", | |
" Downloading pyee-5.0.0-py2.py3-none-any.whl\n", | |
"Collecting websockets (from pyppeteer->requests_html)\n", | |
" Downloading websockets-4.0.1-cp36-cp36m-manylinux1_x86_64.whl (81kB)\n", | |
"\u001b[K 100% |████████████████████████████████| 81kB 5.0MB/s eta 0:00:01\n", | |
"\u001b[?25hRequirement already satisfied: beautifulsoup4 in /opt/conda/lib/python3.6/site-packages (from bs4->requests_html)\n", | |
"Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /opt/conda/lib/python3.6/site-packages (from requests->requests_html)\n", | |
"Requirement already satisfied: idna<2.7,>=2.5 in /opt/conda/lib/python3.6/site-packages (from requests->requests_html)\n", | |
"Requirement already satisfied: urllib3<1.23,>=1.21.1 in /opt/conda/lib/python3.6/site-packages (from requests->requests_html)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.6/site-packages (from requests->requests_html)\n", | |
"Collecting cssselect>0.7.9 (from pyquery->requests_html)\n", | |
" Downloading cssselect-1.0.3-py2.py3-none-any.whl\n", | |
"Collecting lxml>=2.1 (from pyquery->requests_html)\n", | |
" Downloading lxml-4.1.1-cp36-cp36m-manylinux1_x86_64.whl (5.6MB)\n", | |
"\u001b[K 100% |████████████████████████████████| 5.6MB 103kB/s eta 0:00:01\n", | |
"\u001b[?25hBuilding wheels for collected packages: pyppeteer, fake-useragent, bs4\n", | |
" Running setup.py bdist_wheel for pyppeteer ... \u001b[?25ldone\n", | |
"\u001b[?25h Stored in directory: /home/jovyan/.cache/pip/wheels/36/9c/91/0c72fbe31848453f179ce667893a9094e99d9d47f368e07103\n", | |
" Running setup.py bdist_wheel for fake-useragent ... \u001b[?25ldone\n", | |
"\u001b[?25h Stored in directory: /home/jovyan/.cache/pip/wheels/07/04/1d/bbd8ba7d692add504b44552504b7df239bddf56aa3387cee2b\n", | |
" Running setup.py bdist_wheel for bs4 ... \u001b[?25ldone\n", | |
"\u001b[?25h Stored in directory: /home/jovyan/.cache/pip/wheels/84/67/d4/9e09d9d5adede2ee1c7b7e8775ba3fbb04d07c4f946f0e4f11\n", | |
"Successfully built pyppeteer fake-useragent bs4\n", | |
"Installing collected packages: w3lib, pyee, websockets, pyppeteer, fake-useragent, bs4, cssselect, lxml, pyquery, requests-html\n", | |
"Successfully installed bs4-0.0.1 cssselect-1.0.3 fake-useragent-0.1.10 lxml-4.1.1 pyee-5.0.0 pyppeteer-0.0.12 pyquery-1.4.0 requests-html-0.7.2 w3lib-1.19.0 websockets-4.0.1\n" | |
] | |
} | |
], | |
"source": [ | |
"#https://github.com/kennethreitz/requests-html\n", | |
"!pip install requests_html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from parse import *" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#https://publications.parliament.uk/pa/cm/cmregmem/170502/dugher_michael.htm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from requests_html import HTMLSession\n", | |
"session = HTMLSession()\n", | |
"\n", | |
"url='https://publications.parliament.uk/pa/cm/cmregmem/170502/dugher_michael.htm'\n", | |
"r = session.get(url)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 89, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"<Result () {'name': 'VGC Group', 'addr': 'Cardinal House, Bury Street, Ruislip HA4 7GD', 'am': '£1,800 in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally.', 'status': 'company, registration 5741473', 'date': '04 May 2016'}>\n", | |
"<Result ('£', ' in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally.') {'num': 1800}>\n", | |
"--------\n", | |
"<Result () {'name': 'Edward Maurice Watkins', 'addr': 'private', 'am': '£755 in purchasing tickets for a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally. (Registered with donation below.)', 'status': 'individual', 'date': '23 May 2016'}>\n", | |
"<Result ('£', ' in purchasing tickets for a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally. (Registered with donation below.)') {'num': 755}>\n", | |
"--------\n", | |
"<Result () {'name': 'Edward Maurice Watkins', 'addr': 'private', 'am': '£2,000 in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally. (Registered with donation above.)', 'status': 'individual', 'date': '23 May 2016'}>\n", | |
"<Result ('£', ' in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally. (Registered with donation above.)') {'num': 2000}>\n", | |
"--------\n", | |
"<Result () {'name': 'Chris Chenn', 'addr': 'private', 'am': '£1,900 in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally.', 'status': 'individual', 'date': '22 September 2016'}>\n", | |
"<Result ('£', ' in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally.') {'num': 1900}>\n", | |
"--------\n", | |
"--------\n", | |
"<Result () {'name': 'Balmoral Tanks Ltd', 'addr': 'Balmoral Park, Aberdeen AB12 3GY', 'am': '£2,000 to support my Primary School Christmas Card Competition\\nDate received: 8 December 2016\\nDate accepted: 8 December 2016', 'status': 'company, registration 300656', 'date': '09 December 2016'}>\n", | |
"<Result ('£', ' to support my Primary School Christmas Card Competition\\nDate received: 8 December 2016\\nDate accepted: 8 December 2016') {'num': 2000}>\n", | |
"--------\n", | |
"--------\n", | |
"<Result () {'name': 'UK Music', 'addr': '4th Floor, 49 Whitehall, London SW1A 2BX', 'am': 'ticket and hospitality at the Ivor Novello Awards, value £444\\nDate received: 19 May 2016\\nDate accepted: 19 May 2016', 'status': 'company, registration no 3245288', 'date': '27 May 2016'}>\n", | |
"<Result ('ticket and hospitality at the Ivor Novello Awards, value £', '\\nDate received: 19 May 2016\\nDate accepted: 19 May 2016') {'num': 444}>\n", | |
"--------\n", | |
"<Result () {'name': 'The Yorkshire County Cricket Club', 'addr': 'Headingley Cricket Ground, Leeds LS6 3DP', 'am': 'two tickets and hospitality at Yorkshire County Cricket Club to the value of £200, a total of £400\\nDate received: 21 May 2016\\nDate accepted: 21 May 2016', 'status': 'company, registration IP28929R', 'date': '07 June 2016'}>\n", | |
"<Result ('two tickets and hospitality at Yorkshire County Cricket Club to the value of £', ', a total of £400\\nDate received: 21 May 2016\\nDate accepted: 21 May 2016') {'num': 200}>\n", | |
"--------\n", | |
"<Result () {'name': 'Football Association Premier League', 'addr': '30 Gloucester Place, London W1U 8PL', 'am': 'Ticket and hospitality for a concert at Wembley Stadium, value £259\\nDate received: 5 June 2016\\nDate accepted: 5 June 2016', 'status': 'company, registration no 2719699', 'date': '13 June 2016'}>\n", | |
"<Result ('Ticket and hospitality for a concert at Wembley Stadium, value £', '\\nDate received: 5 June 2016\\nDate accepted: 5 June 2016') {'num': 259}>\n", | |
"--------\n", | |
"<Result () {'name': 'Hampshire Cricket Ltd', 'addr': 'Botley Road, West End, Southampton SO30 3XH', 'am': 'two tickets, and accompanying hospitality, to watch cricket at the Ageas Bowl, value for each person £499 plus VAT; £1,197.60 in total\\nDate received: 5 July 2016\\nDate accepted: 5 July 2016', 'status': 'company, registration no 4343355', 'date': '08 July 2016; updated 13 July 2016'}>\n", | |
"<Result ('two tickets, and accompanying hospitality, to watch cricket at the Ageas Bowl, value for each person £', ' plus VAT; £1,197.60 in total\\nDate received: 5 July 2016\\nDate accepted: 5 July 2016') {'num': 499}>\n", | |
"--------\n", | |
"<Result () {'name': 'UK Music', 'addr': '4th Floor, 49 Whitehall, London SW1A 2BX', 'am': 'four tickets to attend a music concert at Wembley Stadium, value £380\\nDate received: 10 September 2016\\nDate accepted: 10 September 2016', 'status': 'company, registration 3245288', 'date': '20 September 2016'}>\n", | |
"<Result ('four tickets to attend a music concert at Wembley Stadium, value £', '\\nDate received: 10 September 2016\\nDate accepted: 10 September 2016') {'num': 380}>\n", | |
"--------\n", | |
"<Result () {'name': 'The Football Association Premier League Ltd', 'addr': '30 Gloucester Place, London W1U 8PL', 'am': 'a ticket and hospitality to watch a football match at the Etihad Stadium, value £268\\nDate received: 6 December 2016\\nDate accepted: 6 December 2016', 'status': 'company, registration 2719699', 'date': '19 December 2016'}>\n", | |
"<Result ('a ticket and hospitality to watch a football match at the Etihad Stadium, value £', '\\nDate received: 6 December 2016\\nDate accepted: 6 December 2016') {'num': 268}>\n", | |
"--------\n", | |
"<Result () {'name': 'William Hill PLC', 'addr': 'Greenside House, 50 Station Road, Wood Green, London N22 7TP', 'am': 'three tickets to attend the World Darts Final, total value £750\\nDate received: 2 January 2017\\nDate accepted: 2 January 2017', 'status': 'company, registration 4212563', 'date': '26 January 2017'}>\n", | |
"<Result ('three tickets to attend the World Darts Final, total value £', '\\nDate received: 2 January 2017\\nDate accepted: 2 January 2017') {'num': 750}>\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n", | |
"--------\n" | |
] | |
} | |
], | |
"source": [ | |
"for p in r.html.find('#mainTextBlock > p'):\n", | |
" #print(p.text)\n", | |
"\n", | |
" pr = parse('''Name of donor: {name}\\nAddress of donor: {addr}\\nAmount of donation or nature and value if donation in kind: {am}\\nDonor status: {status}\\n(Registered {date})''',p.text)\n", | |
" if pr :\n", | |
" print(pr)\n", | |
" print(parse('{?}£{num:n}{}',pr['am']))\n", | |
" print('--------')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<Result () {'n': '5,000'}>\n" | |
] | |
} | |
], | |
"source": [ | |
"print(parse('£{n}',\"£5,000\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment