Skip to content

Instantly share code, notes, and snippets.

@Hypro999
Last active January 16, 2020 09:30
Show Gist options
  • Save Hypro999/a34b82acd9fd5f35f95c546671e3ea06 to your computer and use it in GitHub Desktop.
Save Hypro999/a34b82acd9fd5f35f95c546671e3ea06 to your computer and use it in GitHub Desktop.
BITS Pilani Question Papers Parser: Automatically scan all links in the library's portal's question papers section to find the question papers you want.

Dependencies:

Make sure that you have these installed first.

  1. Python3.6 or a higher version.
  2. The python requests library: pip3 install requests or pip install requests.
  3. The python beautifulsoup4 library: pip3 install beautifulsoup4 or pip install beautifulsoup4.

Usage:

  1. Copy this script to a location where your user has sufficient read/write privileges.
  2. Ensure that you have installed all of the pre-requisites/dependencies.
  3. Run the python script using: python BITS_Pilani_Question_Papers_Parser.py
    Additional Notes for Linux users: you may need to use "python3" instead of "python" when running this. Alternatively, you can change the permissions of the script using chmod u+x BITS_Pilani_Question_Papers_Parser.py and then run the script using ./BITS_Pilani_Question_Papers_Parser.py. You basically have more options and advantages.
  4. Enter the course id of the course you want to download question papers for, e.g. "CS F469"
    Note: Not all courses have question papers available on the library portal. It's a sad fact.
  5. Enter your username and password
    Note: on Linux your password is hidden and won't be echoed onto the screen.
  6. Let the script go through the relevant links and find any question papers that might exist.
  7. At the end (about 3-5 seconds later), if everything went well, you'll be asked if you want to download the papers that it found (if it found any). Enter y/n based on what you want.
  8. Repeat steps 3-7 for any more courses you want papers for.

And that's about it. You saved a bunch of time!

#!/usr/bin/env python3
import os
import sys
import requests
from getpass import getpass
from typing import List
from bs4 import BeautifulSoup
### Script Variables
# Name of the folder (created next to this script) where papers are saved.
FOLDER_NAME = "QuestionPapers"
DOWNLOADS_ROOT_DIRECTORY = os.path.join(os.path.dirname(os.path.abspath(__file__)), FOLDER_NAME)
# Base URL of the library portal when on the campus intranet vs. the public internet.
INTRANET_BASE_URL = "http://library"
INTERNET_BASE_URL = "http://www.bits-pilani.ac.in:12354"
### Helper Functions
def content(response: requests.models.Response) -> str:
    """Decode the raw response body into a UTF-8 string."""
    body = response.content
    return body.decode("utf-8")
def red(string: str) -> str:
    """Wrap *string* in ANSI red escape codes; plain text on Windows."""
    # Windows consoles may not render ANSI sequences, so skip coloring there.
    return string if os.name == "nt" else "\033[91m{}\033[0m".format(string)
def green(string: str) -> str:
    """Wrap *string* in ANSI green escape codes; plain text on Windows."""
    # Color is disabled on Windows for console compatibility.
    return string if os.name == "nt" else "\033[92m{}\033[0m".format(string)
def blue(string: str) -> str:
    """Wrap *string* in ANSI blue escape codes; plain text on Windows."""
    # Color is disabled on Windows for console compatibility.
    return string if os.name == "nt" else "\033[94m{}\033[0m".format(string)
def yellow(string: str) -> str:
    """Wrap *string* in ANSI yellow escape codes; plain text on Windows."""
    # Color is disabled on Windows for console compatibility.
    return string if os.name == "nt" else "\033[93m{}\033[0m".format(string)
def get_absolute_url(session: requests.Session, relative_url: str) -> str:
    """Prefix *relative_url* with the intranet or internet base URL,
    chosen from the domain that issued the session's cookies."""
    # The most recently added cookie domain tells us which portal we hit.
    on_intranet = session.cookies.list_domains()[-1] == "library.local"
    base_url = INTRANET_BASE_URL if on_intranet else INTERNET_BASE_URL
    return "{}/{}".format(base_url, relative_url)
### Main Functions
def obtainSessionToken(session: requests.Session) -> None:
    """Get an unauthenticated session token (PHPSESSID) from the server.

    Also performs a *simple* check that the user is connected to the BITS
    network by looking for a known footer string in the login page. This is
    not exactly meant to be a security measure and is more of a reminder --
    phishing is still possible. Exits the process (status 1) when neither
    the intranet nor the internet portal is reachable.
    """
    print(blue("Attempting to establish an intranet connection..."), end=" ")
    response = session.get("{}/{}".format(INTRANET_BASE_URL, "login.php"))
    # Only the tail of the page is needed to find the identifying footer.
    response_content = content(response)[-150:]
    if "BITS-Library, BITS-Pilani, India." in response_content:
        print(green("Success."))
        return
    print(red("Failed."))
    print(blue("Attempting to establish an internet connection..."), end=" ")
    # BUG FIX: the original passed a single, already-absolute URL to
    # get_absolute_url(), which takes (session, relative_url) -- a
    # guaranteed TypeError. Build the absolute URL directly instead.
    response = session.get("{}/{}".format(INTERNET_BASE_URL, "login.php"))
    response_content = content(response)[-150:]
    if "BITS-Library, BITS-Pilani, India." not in response_content:
        print(red("Failed."))
        print(red("Could not connect to the library portal."))
        sys.exit(1)
    print(green("Success."))
def authenticateSession(session: requests.Session, username: str, password: str) -> None:
    """Log the session in with the user's portal credentials.

    Exits the process (status 1) on a bad username or password; returns
    None on success.
    """
    print(blue("Attempting to authenticate session..."), end=" ")
    data = {"bitsid_psrn": username, "password": password, "submit": "Login"}
    response = session.post(get_absolute_url(session, "login.php"), data=data)
    response_content = content(response)
    # The portal signals the failure mode in the page body rather than via
    # HTTP status codes, so we string-match against its known messages.
    if "Please Sign Up first then Login" in response_content:
        # Typo fix: the original printed "Falied."
        print(red("Failed.\nIncorrect username."))
        sys.exit(1)
    if "Invalid Credentials" in response_content:
        print(red("Failed.\nIncorrect password."))
        sys.exit(1)
    print(green("Success."))
def collectTopLevelLinks(session: requests.Session) -> List[str]:
    """Find all of the top level (main) links to crawl. The pages at these
    links will contain the question papers' links."""
    print(blue("Attempting to collect top-level links..."), end=" ")
    response = session.get(get_absolute_url(session, "services/question_paper.php"))
    soup = BeautifulSoup(content(response), "html.parser")
    # Semester pages are the anchors whose visible text mentions "Semester".
    links = [
        anchor["href"]
        for anchor in soup.find_all("a")
        if "Semester" in anchor.getText()
    ]
    print(green("Success."))
    return links
def collectDocumentLinks(session: requests.Session, top_level_links: List[str], course_id: str) -> List[str]:
    """Go through each top level link and identify the lower level links to
    the documents we want to download.

    Exits the process (status 0) when no documents match *course_id*.
    """
    document_links = []
    print(blue("Analyzing top-level links to collect document links..."))
    for link in top_level_links:
        print(blue("Analyzing: \"{}\"...".format(link)), end=" ")
        # Links are stored relative to the portal root; strip the leading "../".
        response = session.post(get_absolute_url(session, link[3:]))
        try:
            parser = BeautifulSoup(content(response), "html.parser")
            # A <td> whose text equals the course id is followed by a sibling
            # cell whose first child anchor points at the actual document.
            for element in parser.find_all("td"):
                if course_id == element.getText():
                    first_child = next(iter(element.next_sibling.children))
                    document_links.append(first_child["href"])
            print(green("OK."))
        except UnicodeDecodeError:
            # TODO: find out what's causing this.
            # Fixed a no-op here: the original called "Failed.".format(link),
            # which never interpolated anything into the message.
            print(red("Failed."))
    if not document_links:
        print(yellow("Found no matching documents."))
        exit(0)
    return document_links
def createDownloadsRootDirectory() -> None:
    """Create the DOWNLOADS_ROOT_DIRECTORY if it does not exist."""
    if not os.path.exists(DOWNLOADS_ROOT_DIRECTORY):
        print(yellow("Creating new directory: {}".format(DOWNLOADS_ROOT_DIRECTORY)))
    # exist_ok closes the race window between the existence check above and
    # the creation (the original's exists()+mkdir() pair could throw if the
    # directory appeared in between).
    os.makedirs(DOWNLOADS_ROOT_DIRECTORY, exist_ok=True)
def downloadDocuments(session: requests.Session, document_links: List[str]) -> None:
    """Contact each of the lower level links to get the document it contains
    and save it under DOWNLOADS_ROOT_DIRECTORY."""
    for relative_link in document_links:
        # Strip the leading "../" and make the link absolute.
        url = get_absolute_url(session, relative_link[3:])
        filename = url.split("/")[-1]
        print(blue("Downloading \"{}\"...".format(filename)), end=" ")
        response = session.post(url)
        with open(os.path.join(DOWNLOADS_ROOT_DIRECTORY, filename), "wb+") as f:
            f.write(response.content)
        print(green("Success."))
### Script Runner
if __name__ == "__main__":
    if sys.version_info[0] != 3:
        print(yellow("*gasp* you're still using Python 2?! Use Python3!"))
        exit(1)
    session = requests.session()
    course_id = input(blue("course id: "))  # e.g. CS F469
    username = input(blue("username: "))
    if os.name == "nt":
        # Hidden password input is unreliable on some Windows consoles, so
        # fall back to a visible prompt there.
        password = input(blue("password: "))
    else:
        password = getpass(blue("password: "))
    try:
        obtainSessionToken(session)
        authenticateSession(session, username, password)
        top_level_links = collectTopLevelLinks(session)
        document_links = collectDocumentLinks(session, top_level_links, course_id)
        choice = ""
        print(blue("Found the following document links:"))
        for count, document_link in enumerate(document_links):
            print("\t{}. {}".format(count, document_link))
        # Keep prompting until the user gives an unambiguous answer.
        while choice not in ("y", "n"):
            choice = input(blue("\nDownload them (y/n)?"))
        if choice == "y":
            createDownloadsRootDirectory()
            downloadDocuments(session, document_links)
        elif choice == "n":
            exit(0)
        print(green("\nMission complete!"))
    except requests.exceptions.ConnectionError:
        print(red("Failed."))
        print(yellow("Could not establish a stable connection with the library portal. The internet connection sucks as usual..."))
        exit(1)
    except PermissionError:
        print(red("Failed."))
        print(yellow("OS: Permission error."))
        exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment