Created
October 17, 2016 03:09
-
-
Save ejetzer/81a47eee5d553d5160dfe27269d31115 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import html.parser
import pathlib
import shelve
import urllib.parse

import requests
class ListParser(html.parser.HTMLParser):
    """Extract course entries from a McGill program page.

    Scans for elements carrying the ``program-course-title`` class and
    collects, per course: topic (e.g. ``COMP``), numeric code, title,
    and the element's URL.  Iterate the parser to get
    ``(topic, code, title, url)`` tuples.
    """

    def __init__(self, to_feed, *args, **kargs):
        """Initialise empty result lists and optionally parse at once.

        to_feed -- HTML text to feed immediately (skipped when falsy).
        """
        # Falsy while outside an interesting element; otherwise holds the
        # stack depth at which that element was opened (always >= 1).
        self.in_good = False
        self.depth = []  # stack of currently open tag names
        self.titles, self.urls, self.topics, self.codes = [], [], [], []
        self.good_class = 'program-course-title'
        super().__init__(*args, **kargs)
        if to_feed: self.feed(to_feed)

    def is_good(self, tag, attrs):
        """Return True when *attrs* carries the course-title class.

        Side effect: records the element's URL.  BUG FIX: the original
        appended ``attrs[0][1]`` — the value of whichever attribute came
        first (often the class itself) — instead of the ``href``; look
        the ``href`` up by name.
        """
        if ('class', self.good_class) in attrs:
            self.urls.append(dict(attrs).get('href', ''))
            return True

    def handle_starttag(self, tag, attrs):
        self.depth.append(tag)
        if self.in_good:
            # Inside a course element: re-serialise nested markup so it
            # flows through handle_data with the surrounding text.
            a = ' '.join('{}="{}"'.format(n, v) for n, v in attrs)
            self.handle_data('<{} {}>'.format(tag, a))
        if self.is_good(tag, attrs):
            # Remember at what depth the course element opened.
            self.in_good = len(self.depth)

    def handle_endtag(self, tag):
        # BUG FIX: guard against an empty stack (stray closing tag, or a
        # void element such as <br> that never pushed a matching open
        # tag), which previously raised IndexError on self.depth[-1].
        if not self.depth:
            return
        if len(self.depth) == self.in_good and\
           self.depth[-1] == tag:
            # Leaving the course element we were tracking.
            self.in_good = None
        if self.depth[-1] == tag:
            self.depth.pop(-1)
        else:
            raise Exception('Opening and closing tags don\'t match.')
        if self.in_good:
            self.handle_data('</{}>'.format(tag))

    def handle_data(self, data):
        """Split a course heading like 'COMP 202 Title (3 credits)' into
        topic, numeric code and title."""
        data = data.strip()
        if self.in_good and data:
            data = data.split(' ')
            topic, code = data[:2]
            # Title is everything between the code and the trailing
            # '(N credits)' pair of tokens.
            title = ' '.join(data[2:-2])
            while '(' in title:
                # Trim any remaining parenthesised suffix one char at a time.
                title = title[:-1].strip()
            self.titles.append(title)
            self.topics.append(topic)
            self.codes.append(int(code[:3]))

    def __iter__(self):
        return zip(self.topics, self.codes, self.titles, self.urls)
class SearchParser(html.parser.HTMLParser):
    """Collect absolute program URLs from a search-results page.

    A program link is the first <a> found after an
    ``<h4 class="field-content">`` heading.
    """

    def __init__(self, to_feed, *args, **kargs):
        self.in_good = False  # True between a matching <h4> and its first <a>
        self.depth = []       # stack of open tag names
        self.urls = []        # absolute URLs of programs found so far
        self.good_class = 'field-content'
        super().__init__(*args, **kargs)
        self.feed(to_feed)

    def is_good(self, tag, attrs):
        """Return True for <h4 class="field-content"> headings."""
        if tag == 'h4' and ('class', self.good_class) in attrs:
            return True

    def handle_starttag(self, tag, attrs):
        self.depth.append(tag)
        if self.in_good and tag == 'a':
            # BUG FIX: look up 'href' by name; the original appended
            # attrs[0][1], the value of whichever attribute happened to
            # come first on the <a> tag.
            href = dict(attrs).get('href', '')
            self.urls.append('http://www.mcgill.ca' + href)
            self.in_good = False
        if self.is_good(tag, attrs):
            self.in_good = True
def get_programs(pages, url, keyword, faculty):
    """Run the program search and return the program-page URLs found.

    pages   -- number of result pages to fetch (pages 0 .. pages-1)
    url     -- base search URL
    keyword -- full-text search term (e.g. 'Major')
    faculty -- faculty filter as stored in the input CSV, possibly
               percent-encoded (e.g. 'field_faculty_code%3ASC'); may be
               empty, in which case no faculty filter is sent.
    """
    programs = []
    for page in range(pages):
        # BUG FIX: the original used the literal key 'f%5B0%5D' and the
        # still-encoded faculty value; requests percent-encodes params
        # itself, so both ended up double-encoded on the wire.  Send the
        # plain key 'f[0]' with a decoded value instead.
        query = {'search_api_views_fulltext': keyword, 'page': page}
        if faculty:
            query['f[0]'] = urllib.parse.unquote(faculty)
        response = requests.get(url, params=query)
        programs += SearchParser(response.text).urls
        print('Done page', page + 1)
    return programs
def get_courses(programs):
    """Fetch every program page and tally the courses it lists.

    programs -- iterable of program-page URLs (may carry trailing '\n').

    Returns a pair of dicts keyed by (topic, code):
    freqs -- how many programs list the course;
    descs -- (title, absolute URL) for the course.
    """
    freqs, descs = {}, {}
    for program in programs:
        # The iterator keeps trailing newlines; strip before requesting.
        page = requests.get(program.strip())
        for topic, code, title, url in ListParser(page.text):
            key = (topic, code)
            freqs[key] = freqs.get(key, 0) + 1
            descs[key] = (title, 'http://www.mcgill.ca' + url)
    return freqs, descs
def display(freqs, descs, to=None):
    """Print a CSV summary of the most frequent 200/300-level courses,
    optionally saving it to a file.

    freqs -- {(topic, code): count} as produced by get_courses
    descs -- {(topic, code): (title, url)} as produced by get_courses
    to    -- output filename, or None to print only.  BUG FIX: the
             original opened the file unconditionally, so the default
             ``to=None`` crashed in open().
    """
    # Flatten to (topic, code, count) and rank by descending count.
    ranked = sorted([(i[0], i[1], j) for i, j in freqs.items()],
                    key=lambda x: x[2], reverse=True)
    to_write = ['#Count,Topic,Code,Title,URL']
    for topic, code, count in ranked:
        # Stop once courses get rare or ~20 rows have been collected.
        if count < 5 or len(to_write) > 20:
            break
        if 200 < code < 400:  # keep 200- and 300-level courses only
            to_write.append(','.join([str(count), topic, str(code)] + list(descs[(topic, code)])))
    to_write = '\n'.join(to_write)
    if to is not None:
        with open(to, 'w', encoding='utf-8') as f:
            f.write(to_write)
    print(to_write)
# Input CSV: one search per row — page count, base URL, keyword, faculty.
SEARCH_RESULTS = 'Search Results.csv'
# Per-search output: the program URLs found ({} = search index).
PROGRAM_LIST = 'Program List {}.txt'
# Per-search output: CSV summary of the most common courses ({} = search index).
SUMMARY = 'Cours importants {}.csv'
def main():
    """Run every search listed in SEARCH_RESULTS: for each row, fetch the
    matching programs, save their URLs, and write a summary of the most
    common courses across those programs."""
    with open(SEARCH_RESULTS) as f:
        rows = [line.strip().split(',') for line in f]
    # Skip the header row; coerce the page count to int.
    searches = [(int(pages), url, keyword, faculty)
                for pages, url, keyword, faculty in rows[1:]]
    for index, search in enumerate(searches):
        print('#%s' % index)
        programs = get_programs(*search)
        with open(PROGRAM_LIST.format(index), 'w') as f:
            f.write('\n'.join(programs))
        freqs, descs = get_courses(programs)
        display(freqs, descs, SUMMARY.format(index))
# Allow importing this module without kicking off the scrape.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Page count | Base URL | Keyword | Faculty | |
---|---|---|---|---|
3 | http://www.mcgill.ca/study/2016-2017/programs/search | Major | field_faculty_code%3ASC | |
1 | http://www.mcgill.ca/study/2016-2017/programs/search | Major | field_faculty_code%3ALW | |
1 | http://www.mcgill.ca/study/2016-2017/programs/search | Major | field_faculty_code%3AED | |
11 | http://www.mcgill.ca/study/2016-2017/programs/search | Major | ||
2 | http://www.mcgill.ca/study/2016-2017/programs/search | Honours | field_faculty_code%3ASC | |
1 | http://www.mcgill.ca/study/2016-2017/programs/search | Honours | field_faculty_code%3ALW | |
1 | http://www.mcgill.ca/study/2016-2017/programs/search | Honours | field_faculty_code%3AED | |
8 | http://www.mcgill.ca/study/2016-2017/programs/search | Honours |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment