Skip to content

Instantly share code, notes, and snippets.

@ejetzer
Created October 17, 2016 03:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ejetzer/81a47eee5d553d5160dfe27269d31115 to your computer and use it in GitHub Desktop.
Save ejetzer/81a47eee5d553d5160dfe27269d31115 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import html.parser
import requests, shelve, pathlib
class ListParser(html.parser.HTMLParser):
    """Collect the courses linked from a program page.

    An element is a course entry when its ``class`` attribute equals
    ``good_class``.  The text of such an element is expected to look like
    ``"PHYS 230 Dynamics of Simple Systems (3 credits)"``: a topic, a code
    whose first three characters form an integer, a title, and two trailing
    words that are dropped.  Anything from the first ``(`` onwards is also
    removed from the title.

    After feeding, ``topics``, ``codes``, ``titles`` and ``urls`` are
    parallel lists; iterating the parser yields
    ``(topic, code, title, url)`` tuples.

    Args:
        to_feed: HTML text to parse immediately; nothing is fed when falsy.
        *args, **kargs: forwarded to ``html.parser.HTMLParser``.

    Raises:
        Exception: if a closing tag does not match the innermost open tag.
        ValueError: if the text of a course entry has fewer than two words
            or its code does not start with an integer.
    """

    # Void elements never get a closing tag, so tracking them on the stack
    # would make the next real closing tag look mismatched.
    VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self, to_feed, *args, **kargs):
        # False/None while outside a course entry, otherwise the stack depth
        # at which the entry was opened.
        self.in_good = False
        # Stack of currently open (non-void) tags.
        self.depth = []
        self.titles, self.urls, self.topics, self.codes = [], [], [], []
        self.good_class = 'program-course-title'
        # Text pieces seen inside the course entry being read.
        self._text = []
        super().__init__(*args, **kargs)
        if to_feed:
            self.feed(to_feed)

    def is_good(self, tag, attrs):
        """Return True and record the link target if this is a course entry."""
        if ('class', self.good_class) not in attrs:
            return False
        # Prefer the real link target; fall back to the first attribute,
        # which is where the href is assumed to be when none is named.
        href = dict(attrs).get('href')
        self.urls.append(href if href is not None else attrs[0][1])
        return True

    def handle_starttag(self, tag, attrs):
        if tag in self.VOID_TAGS:
            return
        self.depth.append(tag)
        # Tags nested inside an entry only contribute their text.
        if not self.in_good and self.is_good(tag, attrs):
            self.in_good = len(self.depth)
            self._text = []

    def handle_endtag(self, tag):
        if tag in self.VOID_TAGS:
            # Reached for self-closed forms such as <br/>, which were never
            # pushed on the stack.
            return
        if not self.depth or self.depth[-1] != tag:
            raise Exception('Opening and closing tags don\'t match.')
        if len(self.depth) == self.in_good:
            # The course entry itself is closing: parse what it contained.
            self.in_good = None
            self._store_entry(''.join(self._text))
        self.depth.pop()

    def handle_data(self, data):
        if self.in_good:
            self._text.append(data)

    def _store_entry(self, text):
        """Parse the full text of one course entry and record its fields."""
        words = text.split()
        if not words:
            # An empty element is not a course; drop the URL recorded by
            # is_good so the four lists stay aligned.
            self.urls.pop()
            return
        if len(words) < 2:
            raise ValueError('Unexpected course text: {!r}'.format(text))
        topic, code = words[:2]
        # Only the first three characters of the code are kept as a number.
        number = int(code[:3])
        # The last two words are dropped from the title.
        title = ' '.join(words[2:-2])
        head, paren, _ = title.partition('(')
        if paren:
            title = head.strip()
        # Append only after everything parsed, so a failure cannot leave
        # titles/topics/codes with different lengths.
        self.titles.append(title)
        self.topics.append(topic)
        self.codes.append(number)

    def __iter__(self):
        return zip(self.topics, self.codes, self.titles, self.urls)
class SearchParser(html.parser.HTMLParser):
    """Collect program links from a search-results page.

    A link is recorded for the first ``<a>`` carrying an ``href`` that
    follows an ``<h4>`` whose ``class`` attribute equals ``good_class``.
    Each recorded URL is the href prefixed with ``http://www.mcgill.ca``
    and is appended to ``urls``.

    Args:
        to_feed: HTML text to parse immediately.
        *args, **kargs: forwarded to ``html.parser.HTMLParser``.
    """

    def __init__(self, to_feed, *args, **kargs):
        # True between a matching <h4> and the link that follows it.
        self.in_good = False
        # Every start tag seen so far; nothing is ever removed from it.
        self.depth = []
        self.urls = []
        self.good_class = 'field-content'
        super().__init__(*args, **kargs)
        self.feed(to_feed)

    def is_good(self, tag, attrs):
        """Return True for an <h4> whose class is exactly ``good_class``."""
        return tag == 'h4' and ('class', self.good_class) in attrs

    def handle_starttag(self, tag, attrs):
        self.depth.append(tag)
        if self.in_good and tag == 'a':
            # Look the href up by name: it is not guaranteed to be the first
            # attribute, and an anchor without one is not a link at all.
            href = dict(attrs).get('href')
            if href is not None:
                self.urls.append('http://www.mcgill.ca' + href)
                self.in_good = False
        if self.is_good(tag, attrs):
            self.in_good = True
def get_programs(pages, url, keyword, faculty):
    """Fetch the search-result pages and return the program URLs they list.

    Args:
        pages: number of result pages to request (page indices 0..pages-1).
        url: address of the search page.
        keyword: value sent as ``search_api_views_fulltext``.
        faculty: value of the ``f[0]`` filter.  It may be given
            percent-encoded (e.g. ``'field_faculty_code%3ASC'``); when
            empty, no filter is sent.

    Returns:
        A list with the URLs found by ``SearchParser`` on every page, in
        page order.
    """
    from urllib.parse import unquote

    programs = []
    for page in range(pages):
        query = {'search_api_views_fulltext': keyword, 'page': page}
        if faculty:
            # requests percent-encodes keys and values itself, so both must
            # be passed decoded; pre-encoded text would be encoded twice
            # ('%' -> '%25') and the filter would never reach the server.
            query['f[0]'] = unquote(faculty)
        response = requests.get(url, params=query)
        programs += SearchParser(response.text).urls
        print('Done page', page + 1)
    return programs
def get_courses(programs):
    """Download each program page and tally the courses it lists.

    Args:
        programs: iterable of program page URLs; surrounding whitespace
            (such as a trailing newline) is stripped from each one.

    Returns:
        A ``(freqs, descs)`` tuple of dicts keyed by ``(topic, code)``:
        ``freqs`` counts how many times the course was listed, and
        ``descs`` holds ``(title, url)`` from the last listing seen.
    """
    freqs = {}
    descs = {}
    for address in programs:
        page = requests.get(address.strip()).text
        for topic, code, title, path in ListParser(page):
            key = (topic, code)
            if key in freqs:
                freqs[key] += 1
            else:
                freqs[key] = 1
            descs[key] = (title, 'http://www.mcgill.ca' + path)
    return freqs, descs
def display(freqs, descs, to=None):
    """Print, and optionally save, a CSV summary of the most listed courses.

    Courses are taken in decreasing order of count.  Reading stops at the
    first course listed fewer than 5 times or once 20 rows were written;
    only courses whose code is strictly between 200 and 400 produce a row.

    Args:
        freqs: dict mapping ``(topic, code)`` to a count.
        descs: dict mapping ``(topic, code)`` to ``(title, url)``.
        to: path of the file to write (UTF-8); when None the summary is
            only printed.
    """
    import csv
    import io

    ranked = sorted(freqs.items(), key=lambda item: item[1], reverse=True)
    buffer = io.StringIO()
    # The csv writer quotes fields that contain commas or quotes, so such
    # titles do not shift the columns.
    writer = csv.writer(buffer, lineterminator='\n')
    writer.writerow(['#Count', 'Topic', 'Code', 'Title', 'URL'])
    rows = 0
    for (topic, code), count in ranked:
        if count < 5 or rows >= 20:
            break
        # NOTE(review): this excludes a course numbered exactly 200 --
        # confirm whether 200 <= code was intended.
        if 200 < code < 400:
            writer.writerow([count, topic, code] + list(descs[(topic, code)]))
            rows += 1
    # Drop the terminator of the last row: the output has no final newline.
    to_write = buffer.getvalue()[:-1]
    if to is not None:
        with open(to, 'w', encoding='utf-8') as f:
            f.write(to_write)
    print(to_write)
# Input file read by main(): a header line followed by one search per row.
SEARCH_RESULTS = 'Search Results.csv'
# Name template of the program URL list main() writes for each search;
# {} is replaced by the search index.
PROGRAM_LIST = 'Program List {}.txt'
# Name template of the summary file main() passes to display() for each
# search ("Cours importants" is French for "important courses").
SUMMARY = 'Cours importants {}.csv'
def main():
    """Run every search listed in SEARCH_RESULTS and summarise its courses.

    The first line of the file is skipped; every other line must hold four
    comma-separated fields: page count, base URL, keyword and faculty.
    For each search, the program URLs found are written to a PROGRAM_LIST
    file and the course summary to a SUMMARY file, both numbered with the
    search index.
    """
    with open(SEARCH_RESULTS) as handle:
        rows = [line.strip().split(',') for line in handle]
    searches = []
    for pages, url, keyword, faculty in rows[1:]:
        searches.append((int(pages), url, keyword, faculty))
    for index, search in enumerate(searches):
        print('#%s' % index)
        programs = get_programs(*search)
        with open(PROGRAM_LIST.format(index), 'w') as out:
            out.write('\n'.join(programs))
        freqs, descs = get_courses(programs)
        display(freqs, descs, SUMMARY.format(index))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
#Page count Base URL Keyword Faculty
3 http://www.mcgill.ca/study/2016-2017/programs/search Major field_faculty_code%3ASC
1 http://www.mcgill.ca/study/2016-2017/programs/search Major field_faculty_code%3ALW
1 http://www.mcgill.ca/study/2016-2017/programs/search Major field_faculty_code%3AED
11 http://www.mcgill.ca/study/2016-2017/programs/search Major
2 http://www.mcgill.ca/study/2016-2017/programs/search Honours field_faculty_code%3ASC
1 http://www.mcgill.ca/study/2016-2017/programs/search Honours field_faculty_code%3ALW
1 http://www.mcgill.ca/study/2016-2017/programs/search Honours field_faculty_code%3AED
8 http://www.mcgill.ca/study/2016-2017/programs/search Honours
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment