Skip to content

Instantly share code, notes, and snippets.

@darrenburns
Created October 25, 2013 16:15
Show Gist options
  • Save darrenburns/7157314 to your computer and use it in GitHub Desktop.
Save darrenburns/7157314 to your computer and use it in GitHub Desktop.
A web scraping script that downloads the latest PDF files from Glasgow University's Moodle1 page and stores them in appropriately named directories.
# Program to scrape latest documents from Moodle1
# Store files automatically in relevant folders.
from bs4 import BeautifulSoup
import os
import requests
# Moodle credentials (empty placeholders in this file).
USERNAME = "" # Moodle username
PASSWORD = "" # Moodle password
# Each entry is matched as a substring of the link text on the Moodle homepage.
COURSES = ["2A", "2B", "2F"] # Names of courses or course codes
# Homepage that is fetched and scanned for course links.
# NOTE(review): this uses http while LOGIN_URL uses https -- confirm intended.
BASE_URL = "http://fims.moodle.gla.ac.uk/"
# URL the login credentials are POSTed to.
LOGIN_URL = "https://fims.moodle.gla.ac.uk/login/index.php"
# Takes a username and password and returns a logged in
# Session instance. Required for cookies etc. to persist
def moodle_session(username, password):
    """Create a requests Session and POST the credentials to LOGIN_URL.

    The same Session object is returned so that any cookies set by the
    login response are reused by later requests made through it.

    Args:
        username: Value sent as the "username" form field.
        password: Value sent as the "password" form field.

    Returns:
        The requests.Session the login POST was made with. The login
        response itself is not inspected, so a rejected login is not
        reported here.
    """
    session = requests.Session()
    credentials = {"username": username, "password": password}
    session.post(LOGIN_URL, data=credentials)
    return session
# Takes a logged-in session and a list of course names/codes, fetches the
# Moodle homepage (BASE_URL) and returns a dictionary mapping the text of
# each matching link to its URL
def get_page_links(session, courses):
    """Collect homepage links whose text mentions one of the given courses.

    Fetches BASE_URL through ``session`` and scans every ``<a>`` element.
    A link is kept when any entry of ``courses`` occurs as a substring of
    its text.

    Args:
        session: Object with a ``get(url)`` method whose result has a
            ``text`` attribute (e.g. a logged-in requests.Session).
        courses: Course names or codes to look for in the link text.

    Returns:
        A dict mapping the stripped link text to the link's href. Each
        distinct href is stored at most once, under the first matching
        link text that pointed to it.
    """
    html = session.get(BASE_URL).text
    soup = BeautifulSoup(html, "lxml")
    link_dict = {}
    # The dict is keyed by link text, so hrefs are tracked separately to
    # detect the same page being linked more than once.
    seen_hrefs = set()
    for link in soup.find_all("a"):
        href = link.get("href")
        # Anchors without a usable href cannot be followed; skip them
        # instead of failing on the missing attribute.
        if not href or href in seen_hrefs:
            continue
        text = link.text
        if any(course_code in text for course_code in courses):
            link_dict[text.strip()] = href
            seen_hrefs.add(href)
    return link_dict
# Takes a dictionary of links to Moodle pages and returns a dictionary
# of download links to all of the PDF files on those pages
def get_links(session, page_links):
    """Find the PDF links on each course page.

    Every page in ``page_links`` is fetched through ``session`` and
    searched for ``<li>`` elements using the class filter
    "activity resource". Inside those, each ``<a>`` whose text contains
    "PDF document" is recorded.

    Args:
        session: Object with a ``get(url)`` method whose result has a
            ``text`` attribute.
        page_links: Dict mapping a subject name to the URL of its page.

    Returns:
        A dict of dicts: subject name -> {document name -> href}. The
        document name is the link text with " PDF document" removed. When
        a name is already present for that subject, "_<n>" is appended,
        where n counts the repeats seen so far on that subject's page.
    """
    links_to_pdfs = {}
    for subject, page_url in page_links.items():
        page = BeautifulSoup(session.get(page_url).text, "lxml")
        pdfs = {}
        links_to_pdfs[subject] = pdfs
        dup = 0  # Running count of repeated names, shared by the whole page
        for item in page.find_all("li", attrs={"class": "activity resource"}):
            for anchor in item.find_all("a"):
                label = anchor.text
                if "PDF document" not in label:
                    continue
                doc_name = label.replace(" PDF document", "")
                if doc_name in pdfs:
                    # Suffix the name so the earlier entry is not overwritten
                    dup += 1
                    key = doc_name + "_" + str(dup)
                else:
                    key = doc_name
                pdfs[key] = anchor["href"]
    return links_to_pdfs
# Takes a dictionary and creates file directories from the keys if the
# directories do not already exist
def create_dirs(links_to_pdfs):
    """Create "Moodle documents" and one sub-directory per subject.

    Paths are relative to the current working directory. A path that
    already exists is left untouched.

    Args:
        links_to_pdfs: Dict whose keys are the subject names used as
            sub-directory names; the values are not read.
    """
    def ensure(path):
        # Only create the directory when nothing exists at that path yet
        if os.path.exists(path):
            return
        os.makedirs(path)

    ensure("Moodle documents")
    for subject in links_to_pdfs:
        ensure("Moodle documents/" + subject)
# Takes a dictionary of links and downloads them into the appropriate
# directories based on the outermost keys (the subject names)
def fill_dirs(session, links_to_pdfs):
    """Download every linked PDF into its subject's directory.

    Each link is requested through ``session``. The response's final URL
    decides whether the file is kept (it must contain ".pdf") and what it
    is called. Files go to "Moodle documents/<course>/<file name>",
    relative to the current working directory. That directory must
    already exist.

    Args:
        session: Object with a ``get(url)`` method whose result has
            ``url`` and ``content`` attributes.
        links_to_pdfs: Dict of dicts: course name -> {document name ->
            download link}. Only the course names and links are used.
    """
    for course, documents in links_to_pdfs.items():
        course_dir = "Moodle documents/" + course
        for link in documents.values():
            response = session.get(link)
            file_url = response.url
            if ".pdf" not in file_url:
                continue
            # Name the file after the URL path only: a query string or
            # fragment (e.g. "?forcedownload=1") is not part of the name.
            url_path = file_url.split("#", 1)[0].split("?", 1)[0]
            file_name = url_path.split("/")[-1]
            if not file_name:
                # The URL path ends in "/", so there is no name to save under
                continue
            target = course_dir + "/" + file_name
            # Never overwrite a file that is already on disk
            if os.path.exists(target):
                continue
            with open(target, "wb") as pdf_file:
                pdf_file.write(response.content)
def end_session(session):
    """Close ``session`` by calling its ``close()`` method.

    Args:
        session: The session object to close (e.g. a requests.Session).
    """
    session.close()
# Script body: log in, find the course pages, collect their PDF links,
# create the target directories and download the files.
session = moodle_session(USERNAME, PASSWORD)
try:
    page_links = get_page_links(session, COURSES)
    links_to_pdfs = get_links(session, page_links)
    create_dirs(links_to_pdfs)
    fill_dirs(session, links_to_pdfs)
finally:
    # Close the session even when one of the steps above raises
    end_session(session)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment