Skip to content

Instantly share code, notes, and snippets.

@gartenfeld
Created February 23, 2013 05:50
Show Gist options
  • Save gartenfeld/5018624 to your computer and use it in GitHub Desktop.
Save gartenfeld/5018624 to your computer and use it in GitHub Desktop.
Scraping names of participants in a class from a raw HTML dump.
from bs4 import BeautifulSoup
import re # Regular Expressions
import collections # Data Types
import sys # File operations
import codecs # UniCode support
def scrape(page):
    """Return the participant names found in a saved HTML page.

    The page is read as UTF-8 and parsed with BeautifulSoup. Every
    ``<td>`` whose class is ``cell c1`` is inspected and the text of its
    first ``<a>`` element is taken as a name.

    Args:
        page: Path to the HTML file to read.

    Returns:
        A list of name strings in document order. Cells without a link
        are skipped, as are names containing any digit.

    Raises:
        IOError: If the file cannot be opened or read (``OSError`` on
            Python 3).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Read the whole dump; the context manager closes the handle even if
    # decoding fails part-way through.
    with codecs.open(page, 'r', encoding='utf-8') as html_file:
        raw_data = html_file.read()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed on the machine.
    soup = BeautifulSoup(raw_data, 'html.parser')

    students = []
    for cell in soup.find_all('td', 'cell c1'):  # Specifics vary
        link = cell.find('a')
        if link is None:
            # A cell with no link has no name; stringifying None would
            # add a bogus "None" participant.
            continue
        # Read the text straight from the tag; re-encoding and re-parsing
        # the markup breaks on non-ASCII names under Python 2.
        student = link.get_text()
        # Exclude zombie members with numbers anywhere in their names
        # (re.match would only look at the first character).
        if re.search(r'\d', student):
            continue
        students.append(student)
    return students
if __name__ == '__main__':
    # Placeholder path: point this at the saved HTML dump to scrape.
    page = '/file_dir/file_name.html'
    # Parenthesised single-argument form prints the same on Python 2 and
    # is valid syntax on Python 3, unlike the bare print statement.
    print(scrape(page))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment