@elidickinson
Created May 18, 2011 20:53
Scrape BIO exhibitor listings into a CSV
# First: wget --random-wait --wait=2 --limit-rate=100K http://e.mybio.zerista.com/exhibitor?exhibitor_page={1..142}
# But that saves each page under an awkward name like "exhibitor?exhibitor_page=N",
# so rename them with:
# ls -d exhib* | sed 's/\(.*\)=\(.*\)$/mv "&" "bio\2.html"/' | sh
#
# BeautifulSoup 3 on Python 2
from BeautifulSoup import BeautifulSoup
import csv
import glob

writer = csv.writer(open("bio.csv", "wb"))  # "wb" is correct for the Python 2 csv module
for infile in glob.glob('bio*.html'):
    soup = BeautifulSoup(open(infile))
    # each exhibitor listing is a <td class="about">
    for cell in soup.findAll('td', 'about'):
        exhibitor = cell.find('a').string.strip()
        booth = cell.find('p', 'exhibitor_map_link')
        tags = cell.find('span', 'tag_links')
        if tags:
            tags = ", ".join([t.string.strip() for t in tags.findAll('a')])
        if booth:
            booth = booth.string.strip()
        row = [exhibitor, booth, tags]
        # missing booth/tags are None; write empty cells and encode for the Python 2 csv module
        row = [(unicode(x) if x is not None else u'').encode('UTF-8', 'ignore') for x in row]
        print row
        writer.writerow(row)
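The sed one-liner in the header comment builds `mv` commands from a capture group, which can be hard to read at a glance. The same rename step can be sketched in plain Python (the helper name is hypothetical; it assumes the wget output files are named `exhibitor?exhibitor_page=N`, as in the comment above):

```python
import os
import re


def rename_exhibitor_pages(directory):
    """Rename wget output like 'exhibitor?exhibitor_page=7' to 'bio7.html',
    mirroring the sed one-liner in the header comment."""
    for name in os.listdir(directory):
        match = re.search(r'=(\d+)$', name)
        if name.startswith('exhib') and match:
            os.rename(os.path.join(directory, name),
                      os.path.join(directory, 'bio' + match.group(1) + '.html'))
```

Unlike the sed pipeline, this only touches names that actually end in `=<digits>`, so a stray file can't produce a malformed `mv` command.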