Skip to content

Instantly share code, notes, and snippets.

@s2tephen

s2tephen/scrape.py

Created May 18, 2014
Embed
What would you like to do?
Cross-referencing MS clinical trials listed on https://neuinfo.org with medical papers on http://www.ncbi.nlm.nih.gov/pubmed
import os, csv, urllib2, re, mechanize
from BeautifulSoup import BeautifulSoup
# Initialize the row accumulator and the input/output file names.
rows = []  # rows parsed from INPUT_FILE; enriched in place during scraping
INPUT_FILE = 'combined.csv'   # must contain an 'id' column with trial ids
OUTPUT_FILE = 'new.csv'       # input columns plus 'pubmed_urls' and 'results'
# Load every row of the input CSV (each row carries a clinical-trial id)
# so the scraping loop below can enrich them.
if os.path.isfile(INPUT_FILE):
    # 'with' closes the file even on error; the original leaked the handle.
    with open(INPUT_FILE) as input_file:
        rows.extend(csv.DictReader(input_file))
# setup browser
# Configure a mechanize browser that presents itself as a desktop client
# and ignores robots.txt so PubMed serves normal result pages.
br = mechanize.Browser()
# NOTE(review): the User-Agent string looks malformed — "WIndows NT 9.0" and
# an unbalanced trailing "))" — but it is sent verbatim; confirm before changing.
br.addheaders = [('User-agent','Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))')]
# PubMed's robots.txt would otherwise block automated fetches.
br.set_handle_robots(False)
# send request and soupify
print 'scraping...'
for row in rows:
print(' ' + row['id'] + '...'),
results = 0
br.open('http://www.ncbi.nlm.nih.gov/pubmed/?term=' + row['id'] + '+AND+(Clinical+Trial%5Bptyp%5D)')
soup = BeautifulSoup(br.response().read())
if soup.findAll('span', 'icon'): # no results
row['pubmed_urls'] = '';
elif len(soup.findAll('h1')) == 2: # single result
results = 1
row['pubmed_urls'] = 'http://www.ncbi.nlm.nih.gov/pubmed/' + br.geturl().split('=')[1].split('+')[0];
else: # multiple results
results = int(soup.find('h2','result_count').text.split(' ')[-1])
urls = ''
for p in soup.findAll('p', 'title'):
urls += ',http://www.ncbi.nlm.nih.gov' + p.find('a').get('href')
row['pubmed_urls'] = urls[1:]
row['results'] = results
print ' done!'
# Write the enriched rows back out; the header is the input columns plus
# the 'pubmed_urls' and 'results' fields added by the scraping loop.
if rows:  # guard: rows[0] would raise IndexError when the input was empty/missing
    field_names = rows[0].keys()
    # 'wb' is the Python 2 csv convention (avoids blank lines on Windows);
    # 'with' closes the file even on error — the original leaked the handle.
    with open(OUTPUT_FILE, 'wb') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=field_names)
        # Header written as a name->name row (works on Python < 2.7 too).
        writer.writerow(dict((n, n) for n in field_names))
        writer.writerows(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment