Created
October 25, 2013 16:15
-
-
Save darrenburns/7157314 to your computer and use it in GitHub Desktop.
A web scraping script that downloads the latest PDF files from Glasgow University's Moodle1 page and stores them in relevantly named directories.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Program to scrape the latest documents from Moodle1
# and store the files automatically in relevant folders.
import os

import requests
from bs4 import BeautifulSoup
# Moodle credentials. They are read from the MOODLE_USERNAME / MOODLE_PASSWORD
# environment variables when those are set, so they do not have to be written
# into this file; otherwise they fall back to the empty string.
USERNAME = os.environ.get("MOODLE_USERNAME", "")  # Moodle username
PASSWORD = os.environ.get("MOODLE_PASSWORD", "")  # Moodle password
COURSES = ["2A", "2B", "2F"]  # Names of courses or course codes
BASE_URL = "http://fims.moodle.gla.ac.uk/"  # Page that is scanned for course links
LOGIN_URL = "https://fims.moodle.gla.ac.uk/login/index.php"  # Login form target
def moodle_session(username, password):
    """Log in to Moodle and return the session that performed the login.

    A ``requests.Session`` is used so that cookies set by the login response
    persist on later requests made through the same object.

    Args:
        username: Value sent as the ``username`` form field.
        password: Value sent as the ``password`` form field.

    Returns:
        The ``requests.Session`` that posted the login form to ``LOGIN_URL``.

    Raises:
        requests.RequestException: If the login request cannot be completed
            or the server answers with a 4xx/5xx status
            (``requests.HTTPError``).
    """
    payload = {
        "username": username,
        "password": password,
    }
    s = requests.Session()  # Create a new Session instance
    try:
        # NOTE: only the HTTP status is checked. The response body is not
        # inspected, so a rejected login that still returns a success status
        # is not detected here.
        s.post(LOGIN_URL, data=payload).raise_for_status()
    except requests.RequestException:
        # Do not leak the session's connections when the login attempt fails.
        s.close()
        raise
    return s
def get_page_links(session, courses):
    """Collect links on the page at ``BASE_URL`` whose text names a course.

    Args:
        session: Object whose ``get(url)`` returns a response with a ``text``
            attribute (the logged-in session).
        courses: Course names or codes. A link is kept when any of them
            occurs as a substring of the link text.

    Returns:
        A dict mapping the stripped link text to the link's ``href``. Each
        distinct ``href`` is recorded once, under the first matching text it
        appears with.
    """
    html = session.get(BASE_URL).text
    soup = BeautifulSoup(html, "lxml")
    link_dict = {}
    # hrefs already stored. The dict is keyed by link text, so checking the
    # href against the dict's keys would never detect a duplicate target.
    seen_hrefs = set()
    # href=True skips anchors that have no href attribute instead of letting
    # the subscript below raise KeyError.
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href in seen_hrefs:
            continue
        if any(course_code in link.text for course_code in courses):
            seen_hrefs.add(href)
            link_dict[link.text.strip()] = href
    return link_dict
def get_links(session, page_links):
    """Find the PDF download links on each course page.

    Args:
        session: Object whose ``get(url)`` returns a response with a ``text``
            attribute (the logged-in session).
        page_links: Dict mapping a subject name to the URL of its Moodle page.

    Returns:
        A dict mapping each subject name to a dict of
        ``{document name: href}``. The document name is the link text with
        " PDF document" removed. When a name is already taken, "_<n>" is
        appended so that earlier entries are never overwritten.
    """
    links_to_pdfs = {}
    for subject, page_url in page_links.items():
        html = session.get(page_url).text
        soup = BeautifulSoup(html, "lxml")
        resources = soup.find_all("li", attrs={"class": "activity resource"})
        pdf_links = {}
        dup = 0  # Running suffix used to keep duplicate document names apart
        for li in resources:
            # href=True skips anchors without an href attribute, which would
            # otherwise raise KeyError on a["href"].
            for a in li.find_all("a", href=True):
                if "PDF document" not in a.text:
                    continue
                doc_name = a.text.replace(" PDF document", "")
                key = doc_name
                # Keep bumping the suffix until the key is really unused, so
                # a suffixed name can never replace an existing entry.
                while key in pdf_links:
                    dup += 1
                    key = doc_name + "_" + str(dup)
                pdf_links[key] = a["href"]
        links_to_pdfs[subject] = pdf_links
    return links_to_pdfs
def create_dirs(links_to_pdfs):
    """Create "Moodle documents/<subject>" for every key of ``links_to_pdfs``.

    The "Moodle documents" root is created even when the dict is empty.
    Directories that already exist are left untouched.

    Args:
        links_to_pdfs: Dict whose keys are subject names; the values are not
            used here.
    """
    # exist_ok=True replaces the separate "exists?" check, which could race
    # with another process creating the directory in between.
    os.makedirs("Moodle documents", exist_ok=True)
    for subject in links_to_pdfs:
        # The path is built by plain concatenation so it is the same string
        # that fill_dirs uses for the download target.
        os.makedirs("Moodle documents/" + subject, exist_ok=True)
def fill_dirs(session, links_to_pdfs):
    """Download every linked PDF into "Moodle documents/<course>/".

    The file name is the last path segment of the response URL (``req.url``),
    with any query string or fragment removed. A link is skipped when that
    name does not end in ".pdf" (case-insensitive), when the response reports
    an HTTP error, or when the target file already exists.

    Args:
        session: Object whose ``get(url, stream=True)`` returns a response
            offering ``url``, ``ok``, ``iter_content()`` and ``close()``
            (the logged-in session).
        links_to_pdfs: Dict mapping a course name to a dict of
            ``{document name: download URL}``. The course directories must
            already exist.
    """
    for course, pdf_links in links_to_pdfs.items():
        for link in pdf_links.values():
            # stream=True defers fetching the body, so nothing is downloaded
            # for links that turn out to be skipped below.
            req = session.get(link, stream=True)
            try:
                # Strip "#fragment" and "?query" before taking the last
                # segment, so they never end up in the saved file name.
                url_path = req.url.split("#", 1)[0].split("?", 1)[0]
                file_name = url_path.split("/")[-1]
                if not file_name.lower().endswith(".pdf"):
                    continue
                if not req.ok:
                    # Do not save an HTTP error page under a .pdf name.
                    continue
                target = "Moodle documents/" + course + "/" + file_name
                if os.path.exists(target):
                    continue
                with open(target, "wb") as pdf_file:
                    for chunk in req.iter_content(chunk_size=65536):
                        pdf_file.write(chunk)
            finally:
                # A streamed response holds its connection until it is closed.
                req.close()
def end_session(session):
    """Close ``session`` by calling its ``close()`` method.

    Args:
        session: The session object to close.
    """
    session.close()
# Run the scrape: log in, find the course pages, collect their PDF links,
# create the target directories and download the files.
session = moodle_session(USERNAME, PASSWORD)
try:
    page_links = get_page_links(session, COURSES)
    links_to_pdfs = get_links(session, page_links)
    create_dirs(links_to_pdfs)
    fill_dirs(session, links_to_pdfs)
finally:
    # Close the session even when one of the steps above raises.
    end_session(session)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment