Skip to content

Instantly share code, notes, and snippets.

@psthomas
Last active December 22, 2016 20:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save psthomas/663b75d178eeb0e6bc0aff69e0ad7208 to your computer and use it in GitHub Desktop.
Save psthomas/663b75d178eeb0e6bc0aff69e0ad7208 to your computer and use it in GitHub Desktop.
Code for scraping and exploring data from the IGM Experts Forum. http://pstblog.com/2016/08/16/explore-igmforum
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# using environment datascience
# installed requests and bs4
# datascience /Users/psthomas/miniconda2/envs/datascience
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import traceback
def _second_sibling(node):
    """Return the sibling two positions after *node*, or None if missing.

    The page is walked in two-sibling hops (presumably to step over the
    whitespace text node that sits between consecutive tags).
    """
    following = getattr(node, 'next_sibling', None)
    if following is None:
        return None
    return following.next_sibling


def _int_or_none(text):
    """Convert a stripped table-cell string to int; empty cells become None."""
    return int(text) if text else None


def get_data(url, timeout=30):
    """Scrape one participant page into a list of row dicts.

    Args:
        url: Address of the participant bio page to download.
        timeout: Seconds to wait for the server before requests raises a
            timeout error (previously the request could block forever).

    Returns:
        A list with one dict per question found on the page, with keys
        'name', 'institution', 'homepage', 'url', 'qtitle', 'subquestion',
        'qtext', 'vote', 'confidence', 'comments', 'median_vote' and
        'median_conf'.  Returns False (after printing a message) when the
        response status is not 200.

    Raises:
        AttributeError / IndexError / ValueError: if the page does not have
            the expected bio header or answer-table layout.
        requests exceptions: on connection failures or timeouts.
    """
    r = requests.get(url, timeout=timeout)  # Encoding is UTF-8
    if r.status_code != 200:
        print('Failed to access questions url: ' + url)
        return False

    soup = BeautifulSoup(r.text, 'html.parser')

    # Participant details come from the bio header at the top of the page.
    bio = soup.find('div', attrs={'class': 'bioHeader'})
    name_cont = bio.find('h2')
    name = name_cont.get_text(strip=True)
    institution = name_cont.next_sibling.strip()
    homepage = bio.find('a')['href']

    row_list = []
    for title in soup.find_all(class_='surveyTitle'):  # class_ not reserved
        qtitle = title.get_text()
        current = title
        while True:
            # Each question is an <h3> followed by its answer table.  Stop at
            # the first non-<h3> element, or when the siblings run out (end
            # of page) instead of dereferencing None.
            h3 = _second_sibling(current)
            if getattr(h3, 'name', None) != 'h3':
                break
            table = _second_sibling(h3)

            qtext = h3.get_text(strip=True).replace('\n', ' ')
            data = table.find_all('td')
            vote = data[0].get_text(strip=True)  # TODO: handle case of ' ' return?
            confidence = _int_or_none(data[1].get_text(strip=True))
            comments = data[2].get_text(strip=True).replace('\n', ' ')
            median_vote = data[3].get_text(strip=True)
            median_conf = _int_or_none(data[4].get_text(strip=True))

            # Get subquestion: text before the first colon is the label.
            # Split only once so colons inside the question are preserved.
            label, colon, remainder = qtext.partition(':')
            if colon:
                subquestion = label
                qtext = remainder
            else:
                subquestion = 'Question A'

            # Handle newcomers who voted later, ids 42-51: their own answer
            # is in the table that follows the placeholder one.
            if vote == '---':
                table = _second_sibling(table)
                data = table.find_all('td')
                vote = data[0].get_text(strip=True)
                confidence = _int_or_none(data[1].get_text(strip=True))
                comments = data[2].get_text(strip=True).replace('\n', ' ')

            row_list.append({
                'name': name,
                'institution': institution,
                'homepage': homepage,
                'url': url,
                'qtitle': qtitle,
                'subquestion': subquestion,
                'qtext': qtext,
                'vote': vote,
                'confidence': confidence,
                'comments': comments,
                'median_vote': median_vote,
                'median_conf': median_conf,
            })
            current = table
    return row_list
def cycle_pages(base_url, id_list, out_name):
    """Scrape every participant page and write the combined rows to a CSV.

    Args:
        base_url: URL prefix; each id from *id_list* is appended to it.
        id_list: Iterable of participant ids to fetch, e.g. range(1, 52).
        out_name: Output path without extension; '.csv' is appended.

    Pages that raise, or for which get_data returns False, are reported and
    skipped so one bad page does not abort the whole run.  A preview and
    summary of the collected data are printed before the file is written.
    """
    columns = ['name', 'institution', 'url', 'homepage', 'qtitle', 'subquestion', 'qtext',
               'vote', 'confidence', 'comments', 'median_vote', 'median_conf']

    # Collect plain row dicts and build the frame once at the end, rather
    # than re-copying a growing DataFrame with pd.concat on every page.
    all_rows = []
    for i in id_list:
        url = base_url + str(i)
        try:
            row_list = get_data(url)
        except Exception as e:
            # Deliberately broad: this is the best-effort boundary of the crawl.
            print('Exception for url: ' + url + '\n' + str(e))
            print(traceback.format_exc())
        else:
            # False means a non-200 response, which get_data already reported.
            if row_list is not False:
                all_rows.extend(row_list)
                print('Processed url: ' + url)
        time.sleep(0.1)  # Give the server a break

    master_df = pd.DataFrame(all_rows, columns=columns)

    print(master_df.head())
    print(master_df.tail())
    if not master_df.empty:
        # Nothing to summarise when no rows were scraped.
        print(master_df.describe(include='all'))

    # Output all
    master_df.to_csv(out_name + '.csv', encoding='utf-8', index=False, columns=columns)
############################################################################
# Run Scripts

# To debug a single page, call get_data() directly. Acemoglu (id=1) has
# participated in all questions:
#   get_data('http://www.igmchicago.org/igm-economic-experts-panel/participant-bio-2?id=1')

# Participant pages are addressed by a numeric id appended to this base URL.
base_url = 'http://www.igmchicago.org/igm-economic-experts-panel/participant-bio-2?id='

# Run all: iterate through id 1 to 51.
id_list = range(1, 52)
out_name = 'output_all'

# Only scrape when executed as a script, so that importing this module
# (e.g. to reuse get_data) does not fire off network requests and a CSV write.
if __name__ == '__main__':
    cycle_pages(base_url, id_list, out_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment