Skip to content

Instantly share code, notes, and snippets.

@psthomas
Last active December 22, 2016 20:49
Embed
What would you like to do?
Code for scraping and exploring data from the IGM Experts Forum. http://pstblog.com/2016/08/16/explore-igmforum
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# using environment datascience
# installed requests and bs4
# datascience /Users/psthomas/miniconda2/envs/datascience
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import traceback
def _two_siblings_on(node):
    """Return the sibling two steps after *node*, or None if the chain ends.

    The page parsing below always hops two siblings at a time; the
    intermediate node is skipped without being inspected.
    """
    following = getattr(node, 'next_sibling', None)
    if following is None:
        return None
    return following.next_sibling


def _cell_int(cell):
    """Return the stripped text of a table cell as an int, or None if empty."""
    text = cell.get_text(strip=True)
    return int(text) if text else None


def get_data(url):
    """Scrape one participant page and return its answers as a list of dicts.

    Each dict has the keys 'name', 'institution', 'homepage', 'url',
    'qtitle', 'subquestion', 'qtext', 'vote', 'confidence', 'comments',
    'median_vote' and 'median_conf'.

    Returns False (after printing a message) when the response status is
    not 200.  Network errors, timeouts and unexpected page structure
    propagate as exceptions to the caller.
    """
    # A timeout keeps one stalled request from hanging the whole scrape.
    r = requests.get(url, timeout=30)  # Encoding is UTF-8
    if r.status_code != 200:
        print("Failed to access questions url: " + url)
        return False

    soup = BeautifulSoup(r.text, 'html.parser')

    # Participant details come from the bio header block.
    bio = soup.find('div', attrs={'class': 'bioHeader'})
    name_cont = bio.find('h2')
    name = name_cont.get_text(strip=True)
    institution = name_cont.next_sibling.strip()
    homepage = bio.find('a')['href']

    titles = soup.find_all(class_="surveyTitle")  # class_ not reserved
    row_list = []
    for title in titles:
        qtitle = title.get_text()
        current = title
        # After a survey title the page alternates <h3> question / answer
        # table; keep walking until the next element is not an <h3> or the
        # sibling chain runs out (end of page).
        while True:
            h3 = _two_siblings_on(current)
            if getattr(h3, 'name', None) != 'h3':
                break
            table = _two_siblings_on(h3)

            qtext = h3.get_text(strip=True).replace('\n', ' ')
            data = table.find_all('td')
            vote = data[0].get_text(strip=True)  # TODO: handle case of ' ' return?
            confidence = _cell_int(data[1])
            comments = data[2].get_text(strip=True).replace('\n', ' ')
            median_vote = data[3].get_text(strip=True)
            median_conf = _cell_int(data[4])

            # Get subquestion: text before the first colon is the label.
            # Splitting only once keeps any later colons in the question.
            label, sep, remainder = qtext.partition(':')
            if sep:
                subquestion = label
                qtext = remainder
            else:
                subquestion = 'Question A'

            # Handle newcomers who voted later, ids 42-51: their own answer
            # is in the following table; the median values read above are kept.
            if vote == '---':
                table = _two_siblings_on(table)
                data = table.find_all('td')
                vote = data[0].get_text(strip=True)
                confidence = _cell_int(data[1])
                comments = data[2].get_text(strip=True).replace('\n', ' ')

            row_list.append({
                'name': name, 'institution': institution,
                'homepage': homepage, 'url': url, 'qtitle': qtitle,
                'subquestion': subquestion, 'qtext': qtext, 'vote': vote,
                'confidence': confidence, 'comments': comments,
                'median_vote': median_vote, 'median_conf': median_conf,
            })
            current = table
    return row_list
def cycle_pages(base_url, id_list, out_name):
    """Scrape every participant page in *id_list* and write one CSV.

    base_url -- URL prefix; each id is appended to it as a string.
    id_list  -- iterable of participant ids to fetch.
    out_name -- output file name without the '.csv' extension.

    A page that raises has its exception and traceback printed and is
    skipped.  A page for which get_data returns False is skipped as well;
    get_data has already printed the failure.  A short summary of the
    combined data is printed before the CSV is written.
    """
    columns = ['name', 'institution', 'url', 'homepage', 'qtitle',
               'subquestion', 'qtext', 'vote', 'confidence', 'comments',
               'median_vote', 'median_conf']

    # Seed with an empty frame so the column set exists even if every page
    # fails; per-page frames are collected and concatenated once at the end
    # instead of re-copying the growing frame on each iteration.
    frames = [pd.DataFrame(columns=columns)]
    for i in id_list:  # range(1,52)
        url = base_url + str(i)
        try:
            row_list = get_data(url)
            if row_list is not False:
                frames.append(pd.DataFrame(row_list))
        except Exception as e:
            # Best effort: report the failing page and carry on.
            print('Exception for url: ' + url + '\n' + str(e))
            print(traceback.format_exc())
        else:
            if row_list is not False:
                print('Processed url: ' + url)
        time.sleep(0.1)  # Give the server a break

    master_df = pd.concat(frames)

    print(master_df.head())
    print(master_df.tail())
    print(master_df.describe(include='all'))

    # Output all
    master_df.to_csv(out_name + '.csv', encoding='utf-8', index=False,
                     columns=columns)
############################################################################
# Run Scripts
#
# Single-page check (Acemoglu has participated in all questions):
#questions_url = 'http://www.igmchicago.org/igm-economic-experts-panel/participant-bio-2?id=1'
#get_data(questions_url)

# Each participant page is this prefix followed by a numeric id.
base_url = 'http://www.igmchicago.org/igm-economic-experts-panel/participant-bio-2?id='

# Full run: ids 1 to 51, written to output_all.csv
id_list = range(1, 52)
out_name = 'output_all'
cycle_pages(base_url=base_url, id_list=id_list, out_name=out_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment