Skip to content

Instantly share code, notes, and snippets.

@psychemedia
Last active March 2, 2018 15:26
Show Gist options
  • Save psychemedia/4094626f3b20d8d629381feb673c9c48 to your computer and use it in GitHub Desktop.
Save psychemedia/4094626f3b20d8d629381feb673c9c48 to your computer and use it in GitHub Desktop.
Quick demo of a crude scrape of the register of MPs' interests
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting parse\n",
" Downloading parse-1.8.2.tar.gz\n",
"Building wheels for collected packages: parse\n",
" Running setup.py bdist_wheel for parse ... \u001b[?25ldone\n",
"\u001b[?25h Stored in directory: /home/jovyan/.cache/pip/wheels/b0/3b/37/4ab694a0d331a3cc487923ff9c3645d0f103e9cd4762065f77\n",
"Successfully built parse\n",
"Installing collected packages: parse\n",
"Successfully installed parse-1.8.2\n"
]
}
],
"source": [
"#https://github.com/r1chardj0n3s/parse\n",
"!pip install parse"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting requests_html\n",
" Downloading requests_html-0.7.2-py2.py3-none-any.whl\n",
"Collecting w3lib (from requests_html)\n",
" Downloading w3lib-1.19.0-py2.py3-none-any.whl\n",
"Collecting pyppeteer (from requests_html)\n",
" Downloading pyppeteer-0.0.12.tar.gz (70kB)\n",
"\u001b[K 100% |████████████████████████████████| 71kB 4.9MB/s eta 0:00:01\n",
"\u001b[?25hCollecting fake-useragent (from requests_html)\n",
" Downloading fake-useragent-0.1.10.tar.gz\n",
"Collecting bs4 (from requests_html)\n",
" Downloading bs4-0.0.1.tar.gz\n",
"Requirement already satisfied: requests in /opt/conda/lib/python3.6/site-packages (from requests_html)\n",
"Requirement already satisfied: parse in /opt/conda/lib/python3.6/site-packages (from requests_html)\n",
"Collecting pyquery (from requests_html)\n",
" Downloading pyquery-1.4.0-py2.py3-none-any.whl\n",
"Requirement already satisfied: six>=1.4.1 in /opt/conda/lib/python3.6/site-packages (from w3lib->requests_html)\n",
"Collecting pyee (from pyppeteer->requests_html)\n",
" Downloading pyee-5.0.0-py2.py3-none-any.whl\n",
"Collecting websockets (from pyppeteer->requests_html)\n",
" Downloading websockets-4.0.1-cp36-cp36m-manylinux1_x86_64.whl (81kB)\n",
"\u001b[K 100% |████████████████████████████████| 81kB 5.0MB/s eta 0:00:01\n",
"\u001b[?25hRequirement already satisfied: beautifulsoup4 in /opt/conda/lib/python3.6/site-packages (from bs4->requests_html)\n",
"Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /opt/conda/lib/python3.6/site-packages (from requests->requests_html)\n",
"Requirement already satisfied: idna<2.7,>=2.5 in /opt/conda/lib/python3.6/site-packages (from requests->requests_html)\n",
"Requirement already satisfied: urllib3<1.23,>=1.21.1 in /opt/conda/lib/python3.6/site-packages (from requests->requests_html)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.6/site-packages (from requests->requests_html)\n",
"Collecting cssselect>0.7.9 (from pyquery->requests_html)\n",
" Downloading cssselect-1.0.3-py2.py3-none-any.whl\n",
"Collecting lxml>=2.1 (from pyquery->requests_html)\n",
" Downloading lxml-4.1.1-cp36-cp36m-manylinux1_x86_64.whl (5.6MB)\n",
"\u001b[K 100% |████████████████████████████████| 5.6MB 103kB/s eta 0:00:01\n",
"\u001b[?25hBuilding wheels for collected packages: pyppeteer, fake-useragent, bs4\n",
" Running setup.py bdist_wheel for pyppeteer ... \u001b[?25ldone\n",
"\u001b[?25h Stored in directory: /home/jovyan/.cache/pip/wheels/36/9c/91/0c72fbe31848453f179ce667893a9094e99d9d47f368e07103\n",
" Running setup.py bdist_wheel for fake-useragent ... \u001b[?25ldone\n",
"\u001b[?25h Stored in directory: /home/jovyan/.cache/pip/wheels/07/04/1d/bbd8ba7d692add504b44552504b7df239bddf56aa3387cee2b\n",
" Running setup.py bdist_wheel for bs4 ... \u001b[?25ldone\n",
"\u001b[?25h Stored in directory: /home/jovyan/.cache/pip/wheels/84/67/d4/9e09d9d5adede2ee1c7b7e8775ba3fbb04d07c4f946f0e4f11\n",
"Successfully built pyppeteer fake-useragent bs4\n",
"Installing collected packages: w3lib, pyee, websockets, pyppeteer, fake-useragent, bs4, cssselect, lxml, pyquery, requests-html\n",
"Successfully installed bs4-0.0.1 cssselect-1.0.3 fake-useragent-0.1.10 lxml-4.1.1 pyee-5.0.0 pyppeteer-0.0.12 pyquery-1.4.0 requests-html-0.7.2 w3lib-1.19.0 websockets-4.0.1\n"
]
}
],
"source": [
"#https://github.com/kennethreitz/requests-html\n",
"!pip install requests_html"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from parse import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#https://publications.parliament.uk/pa/cm/cmregmem/170502/dugher_michael.htm"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from requests_html import HTMLSession\n",
"session = HTMLSession()\n",
"\n",
"url='https://publications.parliament.uk/pa/cm/cmregmem/170502/dugher_michael.htm'\n",
"r = session.get(url)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------\n",
"--------\n",
"--------\n",
"--------\n",
"--------\n",
"--------\n",
"<Result () {'name': 'VGC Group', 'addr': 'Cardinal House, Bury Street, Ruislip HA4 7GD', 'am': '£1,800 in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally.', 'status': 'company, registration 5741473', 'date': '04 May 2016'}>\n",
"<Result ('£', ' in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally.') {'num': 1800}>\n",
"--------\n",
"<Result () {'name': 'Edward Maurice Watkins', 'addr': 'private', 'am': '£755 in purchasing tickets for a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally. (Registered with donation below.)', 'status': 'individual', 'date': '23 May 2016'}>\n",
"<Result ('£', ' in purchasing tickets for a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally. (Registered with donation below.)') {'num': 755}>\n",
"--------\n",
"<Result () {'name': 'Edward Maurice Watkins', 'addr': 'private', 'am': '£2,000 in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally. (Registered with donation above.)', 'status': 'individual', 'date': '23 May 2016'}>\n",
"<Result ('£', ' in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally. (Registered with donation above.)') {'num': 2000}>\n",
"--------\n",
"<Result () {'name': 'Chris Chenn', 'addr': 'private', 'am': '£1,900 in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally.', 'status': 'individual', 'date': '22 September 2016'}>\n",
"<Result ('£', ' in a successful auction bid at a fundraising dinner for Barnsley East CLP and the office of another MP, the profits from which will be divided equally.') {'num': 1900}>\n",
"--------\n",
"--------\n",
"<Result () {'name': 'Balmoral Tanks Ltd', 'addr': 'Balmoral Park, Aberdeen AB12 3GY', 'am': '£2,000 to support my Primary School Christmas Card Competition\\nDate received: 8 December 2016\\nDate accepted: 8 December 2016', 'status': 'company, registration 300656', 'date': '09 December 2016'}>\n",
"<Result ('£', ' to support my Primary School Christmas Card Competition\\nDate received: 8 December 2016\\nDate accepted: 8 December 2016') {'num': 2000}>\n",
"--------\n",
"--------\n",
"<Result () {'name': 'UK Music', 'addr': '4th Floor, 49 Whitehall, London SW1A 2BX', 'am': 'ticket and hospitality at the Ivor Novello Awards, value £444\\nDate received: 19 May 2016\\nDate accepted: 19 May 2016', 'status': 'company, registration no 3245288', 'date': '27 May 2016'}>\n",
"<Result ('ticket and hospitality at the Ivor Novello Awards, value £', '\\nDate received: 19 May 2016\\nDate accepted: 19 May 2016') {'num': 444}>\n",
"--------\n",
"<Result () {'name': 'The Yorkshire County Cricket Club', 'addr': 'Headingley Cricket Ground, Leeds LS6 3DP', 'am': 'two tickets and hospitality at Yorkshire County Cricket Club to the value of £200, a total of £400\\nDate received: 21 May 2016\\nDate accepted: 21 May 2016', 'status': 'company, registration IP28929R', 'date': '07 June 2016'}>\n",
"<Result ('two tickets and hospitality at Yorkshire County Cricket Club to the value of £', ', a total of £400\\nDate received: 21 May 2016\\nDate accepted: 21 May 2016') {'num': 200}>\n",
"--------\n",
"<Result () {'name': 'Football Association Premier League', 'addr': '30 Gloucester Place, London W1U 8PL', 'am': 'Ticket and hospitality for a concert at Wembley Stadium, value £259\\nDate received: 5 June 2016\\nDate accepted: 5 June 2016', 'status': 'company, registration no 2719699', 'date': '13 June 2016'}>\n",
"<Result ('Ticket and hospitality for a concert at Wembley Stadium, value £', '\\nDate received: 5 June 2016\\nDate accepted: 5 June 2016') {'num': 259}>\n",
"--------\n",
"<Result () {'name': 'Hampshire Cricket Ltd', 'addr': 'Botley Road, West End, Southampton SO30 3XH', 'am': 'two tickets, and accompanying hospitality, to watch cricket at the Ageas Bowl, value for each person £499 plus VAT; £1,197.60 in total\\nDate received: 5 July 2016\\nDate accepted: 5 July 2016', 'status': 'company, registration no 4343355', 'date': '08 July 2016; updated 13 July 2016'}>\n",
"<Result ('two tickets, and accompanying hospitality, to watch cricket at the Ageas Bowl, value for each person £', ' plus VAT; £1,197.60 in total\\nDate received: 5 July 2016\\nDate accepted: 5 July 2016') {'num': 499}>\n",
"--------\n",
"<Result () {'name': 'UK Music', 'addr': '4th Floor, 49 Whitehall, London SW1A 2BX', 'am': 'four tickets to attend a music concert at Wembley Stadium, value £380\\nDate received: 10 September 2016\\nDate accepted: 10 September 2016', 'status': 'company, registration 3245288', 'date': '20 September 2016'}>\n",
"<Result ('four tickets to attend a music concert at Wembley Stadium, value £', '\\nDate received: 10 September 2016\\nDate accepted: 10 September 2016') {'num': 380}>\n",
"--------\n",
"<Result () {'name': 'The Football Association Premier League Ltd', 'addr': '30 Gloucester Place, London W1U 8PL', 'am': 'a ticket and hospitality to watch a football match at the Etihad Stadium, value £268\\nDate received: 6 December 2016\\nDate accepted: 6 December 2016', 'status': 'company, registration 2719699', 'date': '19 December 2016'}>\n",
"<Result ('a ticket and hospitality to watch a football match at the Etihad Stadium, value £', '\\nDate received: 6 December 2016\\nDate accepted: 6 December 2016') {'num': 268}>\n",
"--------\n",
"<Result () {'name': 'William Hill PLC', 'addr': 'Greenside House, 50 Station Road, Wood Green, London N22 7TP', 'am': 'three tickets to attend the World Darts Final, total value £750\\nDate received: 2 January 2017\\nDate accepted: 2 January 2017', 'status': 'company, registration 4212563', 'date': '26 January 2017'}>\n",
"<Result ('three tickets to attend the World Darts Final, total value £', '\\nDate received: 2 January 2017\\nDate accepted: 2 January 2017') {'num': 750}>\n",
"--------\n",
"--------\n",
"--------\n",
"--------\n",
"--------\n",
"--------\n",
"--------\n",
"--------\n",
"--------\n",
"--------\n",
"--------\n",
"--------\n"
]
}
],
"source": [
"for p in r.html.find('#mainTextBlock > p'):\n",
" #print(p.text)\n",
"\n",
" pr = parse('''Name of donor: {name}\\nAddress of donor: {addr}\\nAmount of donation or nature and value if donation in kind: {am}\\nDonor status: {status}\\n(Registered {date})''',p.text)\n",
" if pr :\n",
" print(pr)\n",
" print(parse('{?}£{num:n}{}',pr['am']))\n",
" print('--------')"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<Result () {'n': '5,000'}>\n"
]
}
],
"source": [
"print(parse('£{n}',\"£5,000\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment