Code for scraping and exploring data from the IGM Experts Forum. http://pstblog.com/2016/08/16/explore-igmforum
# using environment datascience
# installed requests and bs4
# datascience    /Users/psthomas/miniconda2/envs/datascience

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import traceback
def get_data(url):
    """Scrape one participant bio page and return a list of vote rows."""
    r = requests.get(url)  # Encoding is UTF-8
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        bio = soup.find('div', attrs={'class': 'bioHeader'})
        name_cont = bio.find('h2')
        name = name_cont.get_text(strip=True)
        institution = name_cont.next_sibling.strip()
        homepage = bio.find('a')['href']
        titles = soup.find_all(class_="surveyTitle")  # class_ because 'class' is a reserved word
        row_list = []
        for title in titles:
            qtitle = title.get_text()
            current = title
            # Each survey title is followed by h3/table pairs, one per subquestion
            while current.next_sibling.next_sibling.name == 'h3':
                h3 = current.next_sibling.next_sibling
                table = h3.next_sibling.next_sibling
                qtext = h3.get_text(strip=True).replace('\n', ' ')
                data = table.find_all('td')
                vote = data[0].get_text(strip=True)  # TODO: handle case of ' ' return?
                confidence = int(data[1].get_text(strip=True)) if data[1].get_text(strip=True) else None
                comments = data[2].get_text(strip=True).replace('\n', ' ')
                median_vote = data[3].get_text(strip=True)
                median_conf = int(data[4].get_text(strip=True)) if data[4].get_text(strip=True) else None
                # Split off the subquestion label (e.g. "Question B: ...") if present
                if len(qtext.split(':')) > 1:
                    parts = qtext.split(':')
                    qtext = parts[1]
                    subquestion = parts[0]
                else:
                    subquestion = 'Question A'
                # Handle newcomers who voted later, ids 42-51: their vote is in a second table
                if vote == '---':
                    table = table.next_sibling.next_sibling
                    data = table.find_all('td')
                    vote = data[0].get_text(strip=True)
                    confidence = int(data[1].get_text(strip=True)) if data[1].get_text(strip=True) else None
                    comments = data[2].get_text(strip=True).replace('\n', ' ')
                row = {'name': name, 'institution': institution, 'homepage': homepage, 'url': url,
                       'qtitle': qtitle, 'subquestion': subquestion, 'qtext': qtext, 'vote': vote,
                       'confidence': confidence, 'comments': comments, 'median_vote': median_vote,
                       'median_conf': median_conf}
                row_list.append(row)
                current = table
                # Stop when we reach the end of the page
                if current.next_sibling.next_sibling is None:
                    break
        return row_list
    else:
        print("Failed to access questions url: " + url)
        return False
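For a quick sanity check on a single page (a sketch, not part of the original gist; it reuses the id=1 Acemoglu URL that also appears commented out in the Run Scripts section below), the parsed rows can be inspected before scraping everything:

# Sketch: parse one participant page and look at the first parsed row
sample_rows = get_data('http://www.igmchicago.org/igm-economic-experts-panel/participant-bio-2?id=1')
if sample_rows:
    print(str(len(sample_rows)) + ' rows parsed')
    print(sample_rows[0]['qtitle'] + ' -> ' + sample_rows[0]['vote'])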
def cycle_pages(base_url, id_list, out_name):
    """Scrape every participant id in id_list and write the combined results to CSV."""
    columns = ['name', 'institution', 'url', 'homepage', 'qtitle', 'subquestion', 'qtext',
               'vote', 'confidence', 'comments', 'median_vote', 'median_conf']
    master_df = pd.DataFrame(columns=columns)
    for i in id_list:  # e.g. range(1, 52)
        url = base_url + str(i)
        try:
            row_list = get_data(url)
            temp_df = pd.DataFrame(row_list)
            master_df = pd.concat([master_df, temp_df], ignore_index=True)
        except Exception as e:
            print('Exception for url: ' + url + '\n' + str(e))
            print(traceback.format_exc())
        else:
            print('Processed url: ' + url)
        time.sleep(0.1)  # Give the server a break
    print(master_df.head())
    print(master_df.tail())
    print(master_df.describe(include='all'))
    # Output all
    master_df.to_csv(out_name + '.csv', encoding='utf-8', index=False, columns=columns)
############################################################################
# Run Scripts

# Acemoglu has participated in all questions:
#questions_url = 'http://www.igmchicago.org/igm-economic-experts-panel/participant-bio-2?id=1'
#get_data(questions_url)

# Iterate through ids 1 to 51:
base_url = 'http://www.igmchicago.org/igm-economic-experts-panel/participant-bio-2?id='

# Run all
id_list = range(1, 52)
out_name = 'output_all'
cycle_pages(base_url, id_list, out_name)
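A minimal exploration sketch for the scraped output (assuming the script above has run and written output_all.csv with the columns defined in cycle_pages; the particular summaries below are illustrative, not part of the original gist):

# Sketch: load the scraped CSV and summarize the votes
import pandas as pd

df = pd.read_csv('output_all.csv', encoding='utf-8')
print(df.shape)

# How often each answer option was chosen across all questions
print(df['vote'].value_counts())

# Median self-reported confidence per question title
print(df.groupby('qtitle')['confidence'].median().sort_values(ascending=False).head(10))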