Skip to content

Instantly share code, notes, and snippets.

@sueszli
Last active August 8, 2023 23:46
Show Gist options
  • Save sueszli/cd8196dc03df4b0a6a07898db2c64368 to your computer and use it in GitHub Desktop.
Save sueszli/cd8196dc03df4b0a6a07898db2c64368 to your computer and use it in GitHub Desktop.
Scrape all PDF files containing old exams from the semi-structured data course at TU Wien
import requests
from bs4 import BeautifulSoup
import os
import PyPDF2
# Target folders for the downloaded PDFs, relative to the working directory.
EXAM_DIRECTORY_PATH = "./exams"
SOLUTION_DIRECTORY_PATH = "./solutions"

# parse page
url = "https://dbai.tuwien.ac.at/education/ssd/pruefung/"
# A timeout keeps the script from hanging forever on an unresponsive server.
page = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of announcing success and then
# scraping an error page that contains no exam links.
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")
checkmark = "\u2713"
print(checkmark, "Reached website")

# scrape links: exams end in "-pruefung.pdf", sample solutions in "-muster.pdf"
examLinks = []
solutionLinks = []
for link in soup.find_all("a"):
    href = link.get("href")
    if href is None:
        # anchor without an href attribute
        continue
    if href.endswith("-pruefung.pdf"):
        examLinks.append(href)
    elif href.endswith("-muster.pdf"):
        solutionLinks.append(href)
print(checkmark, "Found", len(examLinks), "exams")
print(checkmark, "Found", len(solutionLinks), "exam solution")
def downloadPdf(folderName, list):
    """Download every linked PDF into ``folderName``.

    Args:
        folderName: Directory to store the files in; created if it is missing.
        list: Iterable of hrefs scraped from the exam page. Each one is
            resolved against the module-level ``url`` and saved under its
            final path component. (The name shadows the builtin but is kept
            because it is part of the existing signature.)

    Raises:
        requests.HTTPError: If the server answers a download with a 4xx/5xx
            status, so an error page is never stored as a ``.pdf``.
    """
    # Imported locally so the function does not rely on a new file-level import.
    from urllib.parse import urljoin

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folderName, exist_ok=True)
    for elem in list:
        # urljoin resolves relative hrefs against url and passes absolute ones through.
        pdf = requests.get(urljoin(url, elem), timeout=30)
        pdf.raise_for_status()
        # Save under the bare file name so an href that contains a sub-path
        # does not point into a folder that does not exist locally.
        path = os.path.join(folderName, os.path.basename(elem))
        # The context manager closes the file even if the write fails.
        with open(path, "wb") as file:
            file.write(pdf.content)
# Fetch both sets of PDFs: announce, download, confirm.
for startMessage, targetFolder, links, doneMessage in (
    ("Downloading exams...", EXAM_DIRECTORY_PATH, examLinks, "Downloaded all exams"),
    ("Downloading solutions...", SOLUTION_DIRECTORY_PATH, solutionLinks, "Downloaded all solutions"),
):
    print(startMessage)
    downloadPdf(targetFolder, links)
    print(checkmark, doneMessage)
def extractDateFromText(text, elem):
    """Return the exam date found in ``text`` as ``YYYY.MM.DD``.

    The date is taken to be whatever follows the course number ``184.705`` or
    ``181.135`` on a line, written as ``DD.MM.YYYY``. When several lines
    match, the last one wins.

    Args:
        text: Text extracted from the first page of a PDF.
        elem: File name, used only in the diagnostic message.

    Returns:
        The reordered date, or ``"unknown"`` when no course number is present
        or what follows it does not have three dot-separated parts.
    """
    date = ""
    for line in (text or "").split("\n"):
        # 181.135 is checked last so it takes precedence if both are on one line.
        for courseNumber in ("184.705", "181.135"):
            if courseNumber in line:
                date = line.split(courseNumber, 1)[1]
    # Drop every whitespace character (spaces, tabs, stray "\r") so none of
    # them ends up in the resulting file name.
    date = "".join(date.split())
    # turn DD.MM.YYYY to YYYY.MM.DD
    elems = date.split(".")
    if len(elems) < 3:
        # Covers "no course number at all" as well as a malformed remainder;
        # indexing elems[2] here used to raise IndexError.
        print("Date couldn't be found in ", elem)
        return "unknown"
    return elems[2] + "." + elems[1] + "." + elems[0]
def rename(directoryPath):
    """Rename every PDF in ``directoryPath`` to ``<date>.pdf``.

    The date comes from ``extractDateFromText`` applied to the text of the
    first page. If the target name is already taken, ``[V2]``, ``[V3]``, ...
    is appended until a free name is found.

    Args:
        directoryPath: Folder whose files are renamed in place.
    """
    for elem in os.listdir(directoryPath):
        path = os.path.join(directoryPath, elem)
        with open(path, "rb") as file:
            # PdfReader is the current name of PdfFileReader, which PyPDF2 3.x rejects.
            text = PyPDF2.PdfReader(file).pages[0].extract_text()
        # The file is closed at this point; Windows refuses to rename an open file.
        date = extractDateFromText(text, elem)
        newPath = os.path.join(directoryPath, date + ".pdf")
        if path == newPath:
            # Already carries its date name (e.g. on a second run); nothing to do.
            continue
        # rename file (add '[V2]', '[V3]', ... to the end of the file name if it
        # already exists, so an earlier duplicate is never overwritten)
        version = 2
        while os.path.exists(newPath):
            newPath = os.path.join(directoryPath, date + "[V" + str(version) + "]" + ".pdf")
            version += 1
        os.rename(path, newPath)
# Rename both sets of PDFs after their exam dates: announce, rename, confirm.
for startMessage, targetFolder, doneMessage in (
    ("Changing names of exams to their dates...", EXAM_DIRECTORY_PATH, "Renamed all exams"),
    ("Changing names of solutions to their dates...", SOLUTION_DIRECTORY_PATH, "Renamed all solutions"),
):
    print(startMessage)
    rename(targetFolder)
    print(checkmark, doneMessage)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment