Grabs unique names from the San Marcos University (Peru) applicant listings.
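In outline: the script downloads a blog post that links to the official result pages, follows each listing to its numbered subpages, pulls the given names out of the result tables with regular expressions, tallies them in a collections.Counter, and finally prints the rare ones (those appearing at most twice). The code targets Python 2 (urllib2, print statements).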
import urllib2
import re
from collections import Counter

# Blog post that links to the official result pages for the 2014-I exam.
url = 'http://profe-alexz.blogspot.com/2013/09/resultados-examen-admision-san-marcos-2014-i.html'

# Cap on the total number of names to grab before the crawl stops early.
NAME_COUNT_LIMIT = 1000000
def grab_page(url):
    # Fetch a page and return its raw HTML.
    website = urllib2.urlopen(url)
    return website.read()
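# Note: urllib2 exists only in Python 2; under Python 3 the equivalent is
# urllib.request.urlopen, whose .read() returns bytes rather than str.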
def grab_items(regexp, source):
    # Return every match of the regexp in the page source.
    return re.findall(regexp, source)
def base_folder_url(url):
    # Strip the document name, keeping the containing folder (with trailing slash).
    return ''.join(['/'.join(url.split('/')[:-1]), '/'])

def composed_link(base_url, document):
    return ''.join([base_url, document])
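# For example, base_folder_url('http://example.com/res/a.html') returns
# 'http://example.com/res/', and composed_link of that with '12.html' gives
# 'http://example.com/res/12.html' (example.com used purely for illustration).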
def grab_listing_urls():
    # Collect the absolute links to the official result listings
    # (everything under admision.unmsm.edu.pe/res20130914/) from the blog post.
    return grab_items(
        r'"(http://www.admision.unmsm.edu.pe/res20130914/.*?)"',
        grab_page(url)
    )
def grab_subpage_docs_from_listing(listing_url):
    # Subpages are linked by bare numeric document names such as '007.html'.
    return grab_items(r'([0-9]+\.html)', grab_page(listing_url))
def grab_subpage_urls_from_listing(listing_url):
    # Resolve each relative document name against the listing's folder.
    results = []
    base_url = base_folder_url(listing_url)
    documents = grab_subpage_docs_from_listing(listing_url)
    for d in documents:
        results.append(composed_link(base_url, d))
    return results
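# With a hypothetical listing at 'http://www.admision.unmsm.edu.pe/res20130914/A/index.html',
# a document name '012.html' would resolve to '.../res20130914/A/012.html'.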
def grab_names_from_subpage(subpage_url):
    # The table cells apparently hold 'SURNAMES, GIVEN NAMES' followed by a
    # numeric cell. Grab the table body greedily, split it on '</td>', keep
    # the part after the comma of each name cell, then drop Spanish particles
    # so that only single given names remain.
    results = []
    aux = [
        item.replace('<td>', '').split(',')[1].strip() for item in grab_items(
            r'<tr>(.+)</td><td>[0-9]+</td>',
            grab_page(subpage_url)
        )[0].split('</td>') if ',' in item]
    excluded = ['DE', 'DEL', 'EL', 'LA', 'LAS', 'LO', 'LOS']
    for item in aux:
        filtered = [word for word in item.split(' ') if word not in excluded]
        results.extend(filtered)
    return results
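# Hypothetical row for illustration: a cell like '<td>QUISPE HUAMAN, MARIA DEL PILAR'
# yields 'MARIA DEL PILAR', which the particle filter reduces to ['MARIA', 'PILAR'].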
def find_by_test(counter, test):
    # Return the sorted names whose (name, count) pair passes the test.
    return sorted([item[0] for item in counter.items() if test(item)])

def find_uniques(counter):
    return find_by_test(counter, lambda item: item[1] == 1)

def find_frequent(counter):
    return find_by_test(counter, lambda item: item[1] >= 5)

def find_infrequent(counter):
    return find_by_test(counter, lambda item: item[1] < 5)

def find_rare(counter):
    return find_by_test(counter, lambda item: item[1] <= 2)
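# Example: for Counter({'MARIA': 6, 'ZOE': 1, 'NAYELI': 2}), find_uniques gives
# ['ZOE'], find_rare gives ['NAYELI', 'ZOE'] and find_frequent gives ['MARIA'].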
# Walk every listing and every subpage, counting name occurrences until the
# cap is hit, then report the rare names (those seen at most twice).
name_count = Counter()
listings = grab_listing_urls()
total_count = 0
abort = False

for n1 in listings:
    subpages = grab_subpage_urls_from_listing(n1)
    for n2 in subpages:
        print 'Grabbing names from %s...' % n2,
        names = grab_names_from_subpage(n2)
        grabbed = len(names)
        total_count += grabbed
        print '[done] - %d name(s) found - Total: %d' % (grabbed, total_count)
        name_count.update(names)
        if total_count >= NAME_COUNT_LIMIT:
            abort = True
            break
    if abort:
        break

found = find_rare(name_count)
print '%d name(s) found:\n' % len(found)
for n in found:
    print n
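Usage (Python 2 only, since the script relies on urllib2 and print statements): save it as, say, grab_names.py and run it with "python grab_names.py"; the filename is just a placeholder. Note that while the description says "unique" names, the driver calls find_rare, so names appearing once or twice are printed; call find_uniques instead for strictly unique ones.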