Skip to content

Instantly share code, notes, and snippets.

@alecxe
Created March 10, 2014 06:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alecxe/9460230 to your computer and use it in GitHub Desktop.
Save alecxe/9460230 to your computer and use it in GitHub Desktop.
import csv
import urllib2
from bs4 import BeautifulSoup
_BASE_URL = "http://www.admision.unmsm.edu.pe/admisionsabado"


def _fetch_soup(url):
    """Download ``url``, parse it with BeautifulSoup and close the connection."""
    response = urllib2.urlopen(url)
    try:
        # BeautifulSoup consumes the file-like response inside its constructor,
        # so the connection can be released as soon as parsing returns.
        return BeautifulSoup(response)
    finally:
        response.close()


def record(part):
    """Scrape every page of one listing and write its rows to a CSV file.

    ``part`` is a site-relative path to the first page of a listing; it is
    appended to the ``admisionsabado`` base URL. Its last six characters are
    dropped to obtain the prefix shared by the listing's numbered pages
    (``<prefix>0.html``, ``<prefix>1.html``, ...), and that prefix with the
    slashes removed names the output file ``<name>.csv``, created in the
    current working directory.

    For each page, every ``<tr>`` except the first contributes one CSV row
    holding the UTF-8 encoded text of its first six ``<td>`` cells.

    Errors from ``urllib2.urlopen`` propagate to the caller, and an
    ``AttributeError`` is raised if the first page has no ``<table>``.
    """
    # The original format string had no "{}" placeholder, so ``part`` was
    # discarded and the bare base URL was fetched instead of the listing.
    first_page = _fetch_soup("{}{}".format(_BASE_URL, part))

    # Only the number of anchors matters. The first two anchors of the table
    # are ignored and the rest is halved -- presumably each page is linked
    # twice in the pager; verify against the live markup.
    pager_links = first_page.find('table').find_all('a')[2:]
    page_count = len(pager_links) // 2  # floor division: range() needs an int

    prefix = part[:-6]
    name = prefix.replace("/", "")

    # 'wb' is the mode the Python 2 csv module expects for its output file.
    with open('{}.csv'.format(name), 'wb') as f:
        writer = csv.writer(f)
        for i in range(page_count):
            url = "{}{}{}.html".format(_BASE_URL, prefix, i)
            page = _fetch_soup(url)
            # The first <tr> of each page is skipped (treated as the header).
            for tr in page.find_all('tr')[1:]:
                cells = tr.find_all('td')
                writer.writerow(
                    [cell.text.encode('utf-8') for cell in cells[:6]]
                )
# Driver: read the "A" index page, collect the link found in each table row,
# and export every linked listing to its own CSV file via record().
index_response = urllib2.urlopen("http://www.admision.unmsm.edu.pe/admisionsabado/A.html")
try:
    soup = BeautifulSoup(index_response)
finally:
    # Release the connection once the page has been parsed.
    index_response.close()

# Rows without an anchor (tr.a is None) or whose anchor has no href are
# skipped instead of aborting the whole run. Only the first "." of each href
# is removed -- presumably turning a "./..." relative link into a
# site-relative path; confirm against the page markup.
links = [
    tr.a["href"].replace(".", "", 1)
    for tr in soup.find_all('tr')
    if tr.a is not None and tr.a.get("href")
]
for link in links:
    record(link)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment