@gnrfan
Last active December 26, 2015 09:28
Grabs unique names from the San Marcos University (Peru) applicant listings.
import urllib2
import re
from collections import Counter

# Blog post that links to the official UNMSM applicant listing pages.
url = 'http://profe-alexz.blogspot.com/2013/09/resultados-examen-admision-san-marcos-2014-i.html'

# Stop scraping once this many names have been collected.
NAME_COUNT_LIMIT = 1000000
def grab_page(url):
    # Download a page and return its raw HTML.
    website = urllib2.urlopen(url)
    return website.read()
def grab_items(regexp, source):
    # Return every match of regexp found in source.
    return re.findall(regexp, source)

def base_folder_url(url):
    # Strip the document name from a URL, keeping the trailing slash.
    return ''.join(['/'.join(url.split('/')[:-1]), '/'])

def composed_link(base_url, document):
    # Join a folder URL and a document name.
    return ''.join([base_url, document])
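# For example (hypothetical URL, for illustration only):
#   base_folder_url('http://www.example.edu.pe/res/011/5.html')
#   -> 'http://www.example.edu.pe/res/011/'
#   composed_link('http://www.example.edu.pe/res/011/', '7.html')
#   -> 'http://www.example.edu.pe/res/011/7.html'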
def grab_listing_urls():
    # Collect the listing URLs that the blog post links to.
    return grab_items(
        '"(http://www.admision.unmsm.edu.pe/res20130914/.*?)"',
        grab_page(url)
    )
def grab_subpage_docs_from_listing(listing_url):
    # Each listing links to numbered subpages like '1.html', '2.html', ...
    return grab_items('([0-9]+\.html)', grab_page(listing_url))

def grab_subpage_urls_from_listing(listing_url):
    # Turn each subpage document name into a full URL.
    results = []
    base_url = base_folder_url(listing_url)
    documents = grab_subpage_docs_from_listing(listing_url)
    for d in documents:
        results.append(composed_link(base_url, d))
    return results
def grab_names_from_subpage(subpage_url):
    # Extract given names from a results table. Each row holds a cell like
    # 'SURNAMES, GIVEN NAMES'; we keep only the part after the comma.
    results = []
    aux = [
        cell.replace('<td>', '').split(',')[1].strip()
        for cell in grab_items(
            '<tr>(.+)</td><td>[0-9]+</td>',
            grab_page(subpage_url)
        )[0].split('</td>') if ',' in cell
    ]
    # Connecting particles (e.g. 'DEL' in 'MARIA DEL CARMEN') are not
    # names themselves, so they are dropped.
    excluded = ['DE', 'DEL', 'EL', 'LA', 'LAS', 'LO', 'LOS']
    for names in aux:
        filtered = [word for word in names.split(' ') if word not in excluded]
        results.extend(filtered)
    return results
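# A worked example on a made-up table row (hypothetical data):
#   '<td>123456</td><td>QUISPE HUAMAN, JOSE DEL CARMEN</td><td>85</td>'
#   -> 'JOSE DEL CARMEN' after the comma split
#   -> ['JOSE', 'CARMEN'] after dropping the excluded particles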
def find_by_test(counter, test):
    # Return the sorted keys of counter whose (key, count) pair passes test.
    return sorted([item[0] for item in counter.items() if test(item)])

def find_uniques(counter):
    # Names that appear exactly once.
    return find_by_test(counter, lambda item: item[1] == 1)

def find_frequent(counter):
    # Names that appear five times or more.
    return find_by_test(counter, lambda item: item[1] >= 5)

def find_infrequent(counter):
    # Names that appear fewer than five times.
    return find_by_test(counter, lambda item: item[1] < 5)

def find_rare(counter):
    # Names that appear at most twice.
    return find_by_test(counter, lambda item: item[1] <= 2)
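# For instance (illustrative counts, not real data):
#   find_uniques(Counter({'JOSE': 7, 'ZENON': 1}))  -> ['ZENON']
#   find_rare(Counter({'JOSE': 7, 'ZENON': 1}))     -> ['ZENON']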
# Walk every listing and subpage, tallying how often each name appears.
name_count = Counter()
listings = grab_listing_urls()
total_count = 0
abort = False
for n1 in listings:
    subpages = grab_subpage_urls_from_listing(n1)
    for n2 in subpages:
        print 'Grabbing names from %s...' % n2,
        names = grab_names_from_subpage(n2)
        grabbed = len(names)
        total_count += grabbed
        print '[done] - %d name(s) found - Total: %d' % (grabbed, total_count)
        name_count.update(names)
        if total_count >= NAME_COUNT_LIMIT:
            abort = True
            break
    if abort:
        break
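# Note: the script reports names seen at most twice (find_rare); swap in
# find_uniques(name_count) below if you want strictly unique names only.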
found = find_rare(name_count)
print '%d name(s) found:\n' % len(found)
for n in found:
    print n
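# To run (Python 2 only; the source pages must still be online):
#   $ python grab_names.py        # 'grab_names.py' is a hypothetical filename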