Skip to content

Instantly share code, notes, and snippets.

@Hypro999
Last active January 16, 2020 09:30
Show Gist options
  • Save Hypro999/a34b82acd9fd5f35f95c546671e3ea06 to your computer and use it in GitHub Desktop.
Save Hypro999/a34b82acd9fd5f35f95c546671e3ea06 to your computer and use it in GitHub Desktop.
BITS Pilani Question Papers Parser: Automatically scan all links in the library's portal's question papers section to find the question papers you want.

Dependencies:

Make sure that you have these installed first.

  1. Python3.6 or a higher version.
  2. The python requests library: pip3 install requests or pip install requests.
  3. The python beautifulsoup4 library: pip3 install beautifulsoup4 or pip install beautifulsoup4.

Usage:

  1. Copy this script to a location where your user has sufficient read/write privileges.
  2. Ensure that you have installed all of the pre-requisites/dependencies.
  3. Run the python script using: python BITS_Pilani_Question_Papers_Parser.py
    Additional Notes for Linux users: you may need to use "python3" instead of "python" when running this. Alternatively, you can change the permissions of the script using chmod u+x BITS_Pilani_Question_Papers_Parser.py and then run the script using ./BITS_Pilani_Question_Papers_Parser.py. You basically have more options and advantages.
  4. Enter the course id of the course you want to download question papers for, e.g. "CS F469"
    Note: Not all courses have question papers available on the library portal. It's a sad fact.
  5. Enter your username and password
    Note: on Linux your password is hidden and won't be echoed onto the screen.
  6. Let the script go through the relevant links and find any question papers that might exist.
  7. At the end (about 3-5 seconds later), if everything went well, you'll be asked if you want to download the papers that it found (if it found any). Enter y/n based on what you want.
  8. Repeat steps 3-7 for any more courses you want papers for.

And that's about it. You saved a bunch of time!

#!/usr/bin/env python3
import os
import sys
import requests
from getpass import getpass
from typing import List
from bs4 import BeautifulSoup
### Script Variables
# Name of the folder (created next to this script) where papers are saved.
FOLDER_NAME = "QuestionPapers"
DOWNLOADS_ROOT_DIRECTORY = os.path.join(os.path.dirname(os.path.abspath(__file__)), FOLDER_NAME)
# Base URL of the library portal when on the campus intranet vs. the public internet.
INTRANET_BASE_URL = "http://library"
INTERNET_BASE_URL = "http://www.bits-pilani.ac.in:12354"
### Helper Functions
def content(response: requests.models.Response) -> str:
    """Decode the raw response body into a UTF-8 string."""
    body = response.content
    return body.decode("utf-8")
def red(string: str) -> str:
    """Wrap *string* in ANSI red escape codes; plain text on Windows."""
    # Windows consoles may not render ANSI sequences, so skip coloring there.
    return string if os.name == "nt" else "\033[91m{}\033[0m".format(string)
def green(string: str) -> str:
    """Wrap *string* in ANSI green escape codes; plain text on Windows."""
    # Color is disabled on Windows for console compatibility.
    return string if os.name == "nt" else "\033[92m{}\033[0m".format(string)
def blue(string: str) -> str:
    """Wrap *string* in ANSI blue escape codes; plain text on Windows."""
    # Color is disabled on Windows for console compatibility.
    return string if os.name == "nt" else "\033[94m{}\033[0m".format(string)
def yellow(string: str) -> str:
    """Wrap *string* in ANSI yellow escape codes; plain text on Windows."""
    # Color is disabled on Windows for console compatibility.
    return string if os.name == "nt" else "\033[93m{}\033[0m".format(string)
def get_absolute_url(session: requests.Session, relative_url: str) -> str:
    """Prefix *relative_url* with the intranet or internet base URL,
    chosen from the domain that issued the session's cookies."""
    # The most recently added cookie domain tells us which portal we hit.
    on_intranet = session.cookies.list_domains()[-1] == "library.local"
    base_url = INTRANET_BASE_URL if on_intranet else INTERNET_BASE_URL
    return "{}/{}".format(base_url, relative_url)
### Main Functions
def obtainSessionToken(session: requests.Session) -> None:
    """Get an unauthenticated session token (PHPSESSID) from the server.

    Also performs a *simple* check that the user is connected to the BITS
    network by looking for a known footer string in the login page. This is
    not exactly meant to be a security measure and is more of a reminder --
    phishing is still possible. Exits the process (status 1) when neither
    the intranet nor the internet portal is reachable.
    """
    print(blue("Attempting to establish an intranet connection..."), end=" ")
    response = session.get("{}/{}".format(INTRANET_BASE_URL, "login.php"))
    # Only the tail of the page is needed to find the identifying footer.
    response_content = content(response)[-150:]
    if "BITS-Library, BITS-Pilani, India." in response_content:
        print(green("Success."))
        return
    print(red("Failed."))
    print(blue("Attempting to establish an internet connection..."), end=" ")
    # BUG FIX: the original passed a single, already-absolute URL to
    # get_absolute_url(), which takes (session, relative_url) -- a
    # guaranteed TypeError. Build the absolute URL directly instead.
    response = session.get("{}/{}".format(INTERNET_BASE_URL, "login.php"))
    response_content = content(response)[-150:]
    if "BITS-Library, BITS-Pilani, India." not in response_content:
        print(red("Failed."))
        print(red("Could not connect to the library portal."))
        sys.exit(1)
    print(green("Success."))
def authenticateSession(session: requests.Session, username: str, password: str) -> None:
    """Log the session in with the user's portal credentials.

    Exits the process (status 1) on a bad username or password; returns
    None on success.
    """
    print(blue("Attempting to authenticate session..."), end=" ")
    data = {"bitsid_psrn": username, "password": password, "submit": "Login"}
    response = session.post(get_absolute_url(session, "login.php"), data=data)
    response_content = content(response)
    # The portal signals the failure mode in the page body rather than via
    # HTTP status codes, so we string-match against its known messages.
    if "Please Sign Up first then Login" in response_content:
        # Typo fix: the original printed "Falied."
        print(red("Failed.\nIncorrect username."))
        sys.exit(1)
    if "Invalid Credentials" in response_content:
        print(red("Failed.\nIncorrect password."))
        sys.exit(1)
    print(green("Success."))
def collectTopLevelLinks(session: requests.Session) -> List[str]:
    """Find all of the top level (main) links to crawl. The pages at these
    links will contain the question papers' links."""
    print(blue("Attempting to collect top-level links..."), end=" ")
    response = session.get(get_absolute_url(session, "services/question_paper.php"))
    soup = BeautifulSoup(content(response), "html.parser")
    # Semester pages are the anchors whose visible text mentions "Semester".
    links = [
        anchor["href"]
        for anchor in soup.find_all("a")
        if "Semester" in anchor.getText()
    ]
    print(green("Success."))
    return links
def collectDocumentLinks(session: requests.Session, top_level_links: List[str], course_id: str) -> List[str]:
    """Go through each top level link and identify the lower level links to
    the documents we want to download.

    Exits the process (status 0) when no documents match *course_id*.
    """
    document_links = []
    print(blue("Analyzing top-level links to collect document links..."))
    for link in top_level_links:
        print(blue("Analyzing: \"{}\"...".format(link)), end=" ")
        # Links are stored relative to the portal root; strip the leading "../".
        response = session.post(get_absolute_url(session, link[3:]))
        try:
            parser = BeautifulSoup(content(response), "html.parser")
            # A <td> whose text equals the course id is followed by a sibling
            # cell whose first child anchor points at the actual document.
            for element in parser.find_all("td"):
                if course_id == element.getText():
                    first_child = next(iter(element.next_sibling.children))
                    document_links.append(first_child["href"])
            print(green("OK."))
        except UnicodeDecodeError:
            # TODO: find out what's causing this.
            # Fixed a no-op here: the original called "Failed.".format(link),
            # which never interpolated anything into the message.
            print(red("Failed."))
    if not document_links:
        print(yellow("Found no matching documents."))
        exit(0)
    return document_links
def createDownloadsRootDirectory() -> None:
    """Create the DOWNLOADS_ROOT_DIRECTORY if it does not exist."""
    if not os.path.exists(DOWNLOADS_ROOT_DIRECTORY):
        print(yellow("Creating new directory: {}".format(DOWNLOADS_ROOT_DIRECTORY)))
    # exist_ok closes the race window between the existence check above and
    # the creation (the original's exists()+mkdir() pair could throw if the
    # directory appeared in between).
    os.makedirs(DOWNLOADS_ROOT_DIRECTORY, exist_ok=True)
def downloadDocuments(session: requests.Session, document_links: List[str]) -> None:
    """Contact each of the lower level links to get the document it contains
    and save it under DOWNLOADS_ROOT_DIRECTORY."""
    for relative_link in document_links:
        # Strip the leading "../" and make the link absolute.
        url = get_absolute_url(session, relative_link[3:])
        filename = url.split("/")[-1]
        print(blue("Downloading \"{}\"...".format(filename)), end=" ")
        response = session.post(url)
        with open(os.path.join(DOWNLOADS_ROOT_DIRECTORY, filename), "wb+") as f:
            f.write(response.content)
        print(green("Success."))
### Script Runner
if __name__ == "__main__":
    if sys.version_info[0] != 3:
        print(yellow("*gasp* you're still using Python 2?! Use Python3!"))
        exit(1)
    session = requests.session()
    course_id = input(blue("course id: "))  # e.g. CS F469
    username = input(blue("username: "))
    if os.name == "nt":
        # Hidden password input is unreliable on some Windows consoles, so
        # fall back to a visible prompt there.
        password = input(blue("password: "))
    else:
        password = getpass(blue("password: "))
    try:
        obtainSessionToken(session)
        authenticateSession(session, username, password)
        top_level_links = collectTopLevelLinks(session)
        document_links = collectDocumentLinks(session, top_level_links, course_id)
        choice = ""
        print(blue("Found the following document links:"))
        for count, document_link in enumerate(document_links):
            print("\t{}. {}".format(count, document_link))
        # Keep prompting until the user gives an unambiguous answer.
        while choice not in ("y", "n"):
            choice = input(blue("\nDownload them (y/n)?"))
        if choice == "y":
            createDownloadsRootDirectory()
            downloadDocuments(session, document_links)
        elif choice == "n":
            exit(0)
        print(green("\nMission complete!"))
    except requests.exceptions.ConnectionError:
        print(red("Failed."))
        print(yellow("Could not establish a stable connection with the library portal. The internet connection sucks as usual..."))
        exit(1)
    except PermissionError:
        print(red("Failed."))
        print(yellow("OS: Permission error."))
        exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment