Skip to content

Instantly share code, notes, and snippets.

@tapionx
Last active July 5, 2018 08:57
Show Gist options
  • Save tapionx/c9e173a52c667e633444a5ce7f0563bb to your computer and use it in GitHub Desktop.
Save tapionx/c9e173a52c667e633444a5ce7f0563bb to your computer and use it in GitHub Desktop.
Scrape email addresses for all the Italian euro-deputies (https://changecopyright.org/it)
# requirements: pip install requests-html
import json
from requests_html import HTMLSession
import requests
# ---------------------------------------------------------------------------
# Request configuration for the Europarl MEP search endpoint.
# ---------------------------------------------------------------------------

# Country code placed in the `country` form field of the search request.
desired_country_code = 'IT'

# Cookies passed along with the search POST.
cookies = {
    '__verify': '1',
    'europarl_cookies_accepted': '1',
    'ROUTEID': '.node1',
    'HTTPD_ROUTEID': '.node1',
}

# Browser-like headers for the search POST: a Chrome User-Agent, the
# XMLHttpRequest marker and a form-encoded content type.
headers = {
    'Pragma': 'no-cache',
    'Origin': 'http://www.europarl.europa.eu',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Cache-Control': 'no-cache',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
    'Referer': 'http://www.europarl.europa.eu/meps/it/search.html',
    'DNT': '1',
}

# Form fields of the search. Only `country` carries a real filter value;
# the remaining fields are empty, and `bodyType` is set to 'ALL'.
data = [
    ('mepId', ''),
    ('country', desired_country_code),
    ('countryCircons', ''),
    ('politicalGroup', ''),
    ('bodyType', 'ALL'),
    ('bodyValue', ''),
]

# HTML-parsing session used to fetch each deputy's detail page.
session = HTMLSession()
# Seconds to wait on any single HTTP request. Without an explicit timeout,
# requests waits indefinitely on a server that stops responding.
REQUEST_TIMEOUT = 30

# Query the search endpoint for the deputies matching the form fields in
# `data`; the response is parsed as JSON below.
response = requests.post(
    'http://www.europarl.europa.eu/meps/it/json/getDistricts.html',
    headers=headers,
    cookies=cookies,
    data=data,
    timeout=REQUEST_TIMEOUT,
)
# Fail with a clear HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
deputies = response.json()

# Visit each deputy's detail page and print the decoded email address, if any.
for deputy in deputies['result']:
    url = 'http://www.europarl.europa.eu' + deputy['detailUrl']
    r = session.get(url, timeout=REQUEST_TIMEOUT)

    # First element carrying class="link_email"; None when the page has none.
    email_tag = r.html.xpath('//*[@class="link_email"]', first=True)
    if not email_tag:
        continue

    # Skip anchors that carry no href rather than crashing with KeyError.
    href = email_tag.attrs.get('href')
    if not href:
        continue

    # Undo the obfuscation: drop the 'mailto:' scheme, substitute the
    # '[dot]' / '[at]' placeholders, then reverse the whole string.
    # NOTE(review): presumably the site stores the address reversed — confirm
    # against a live page if the markup changes.
    email = (
        href.replace('mailto:', '')
        .replace('[dot]', '.')
        .replace('[at]', '@')[::-1]
    )
    print(email)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment