|
#!/usr/bin/env python3 |
|
|
|
import os |
|
import sys |
|
import requests |
|
from getpass import getpass |
|
from typing import List |
|
from bs4 import BeautifulSoup |
|
|
|
|
|
### Script Variables |
|
# Name of the folder that receives the downloaded question papers.
FOLDER_NAME = "QuestionPapers"

# Absolute path of that folder; it sits next to this script file.
DOWNLOADS_ROOT_DIRECTORY = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    FOLDER_NAME,
)

# Base URLs of the library portal: the first is tried before the second.
INTRANET_BASE_URL = "http://library"
INTERNET_BASE_URL = "http://www.bits-pilani.ac.in:12354"
|
|
|
|
|
### Helper Functions |
|
def content(response: requests.models.Response) -> str:
    """ Return the body of `response` decoded strictly as UTF-8.

    Raises UnicodeDecodeError when the body is not valid UTF-8. """
    raw_body = response.content
    return str(raw_body, encoding="utf-8")
|
|
|
def red(string: str) -> str:
    """ Wrap `string` in the ANSI escape sequence 91 (red) followed by a reset.

    On Windows (os.name == "nt") the text is returned unchanged, i.e. without color. """
    on_windows = os.name == "nt"
    return string if on_windows else "\033[91m{}\033[0m".format(string)
|
|
|
def green(string: str) -> str:
    """ Wrap `string` in the ANSI escape sequence 92 (green) followed by a reset.

    On Windows (os.name == "nt") the text is returned unchanged, i.e. without color. """
    if os.name != "nt":
        return "\033[92m{}\033[0m".format(string)
    # Windows: colors are disabled, hand the text back untouched.
    return string
|
|
|
def blue(string: str) -> str:
    """ Wrap `string` in the ANSI escape sequence 94 (blue) followed by a reset.

    On Windows (os.name == "nt") the text is returned unchanged, i.e. without color. """
    colored = "\033[94m{}\033[0m".format(string)
    plain = string
    return plain if os.name == "nt" else colored
|
|
|
def yellow(string: str) -> str:
    """ Wrap `string` in the ANSI escape sequence 93 (yellow) followed by a reset.

    On Windows (os.name == "nt") the text is returned unchanged, i.e. without color. """
    use_color = not (os.name == "nt")
    if use_color:
        result = "\033[93m{}\033[0m".format(string)
    else:
        result = string
    return result
|
|
|
def get_absolute_url(session: requests.Session, relative_url: str) -> str:
    """ Build the absolute portal URL for `relative_url`.

    The base URL is chosen from the session's cookies: if the last domain reported by the
    cookie jar is "library.local", INTRANET_BASE_URL is used, otherwise INTERNET_BASE_URL.
    A session that holds no cookies at all also gets INTERNET_BASE_URL.

    Returns the base URL and `relative_url` joined with a single "/". """
    cookie_domains = session.cookies.list_domains()
    # Guard the [-1] lookup: on an empty cookie jar it would raise IndexError.
    on_intranet = bool(cookie_domains) and cookie_domains[-1] == "library.local"
    base_url = INTRANET_BASE_URL if on_intranet else INTERNET_BASE_URL
    return "{}/{}".format(base_url, relative_url)
|
|
|
|
|
### Main Functions |
|
def obtainSessionToken(session: requests.Session) -> None:
    """ Get an unauthenticated session token from the server. Also perform a *simple* check to
    make sure that the user is connected to the BITS Network. This is not exactly meant to be
    a security measure and is more of a reminder. Phishing is still possible. The session token
    is PHPSESSID.

    The intranet login page is requested first; if it cannot be reached or does not look like
    the library portal, the internet login page is requested instead. A page "looks like" the
    portal when its last 150 characters contain the library footer text.

    Exits the process with status 1 when the internet page does not match either. A
    requests ConnectionError raised by the internet attempt is left to the caller. """
    footer_marker = "BITS-Library, BITS-Pilani, India."

    print(blue("Attempting to establish an intranet connection..."), end=" ")
    try:
        response = session.get("{}/{}".format(INTRANET_BASE_URL, "login.php"))
    except requests.exceptions.ConnectionError:
        # An unreachable intranet host must not abort the script: fall through to the
        # internet attempt below instead.
        response = None
    if response is not None and footer_marker in content(response)[-150:]:
        print(green("Success."))
        return
    print(red("Failed."))

    print(blue("Attempting to establish an internet connection..."), end=" ")
    # Request the internet URL directly; get_absolute_url() needs a session argument and
    # would prepend a base URL a second time.
    response = session.get("{}/{}".format(INTERNET_BASE_URL, "login.php"))
    if footer_marker not in content(response)[-150:]:
        print(red("Failed."))
        print(red("Could not connect to the library portal."))
        sys.exit(1)
    print(green("Success."))
|
|
|
def authenticateSession(session: requests.Session, username: str, password: str) -> None:
    """ Log `session` in by posting the credentials to the portal's login.php.

    The response page is searched for the portal's two error messages. If either is found,
    a failure message is printed and the process exits with status 1; otherwise the login
    is reported as successful. """
    print(blue("Attempting to authenticate session..."), end=" ")
    login_form = {"bitsid_psrn": username, "password": password, "submit": "Login"}
    response = session.post(get_absolute_url(session, "login.php"), data=login_form)
    page = content(response)

    if "Please Sign Up first then Login" in page:
        print(red("Failed.\nIncorrect username."))
        sys.exit(1)
    if "Invalid Credentials" in page:
        print(red("Failed.\nIncorrect password."))
        sys.exit(1)

    print(green("Success."))
|
|
|
def collectTopLevelLinks(session: requests.Session) -> List[str]:
    """ Find all of the top level (main) links to crawl. The pages at these links will contain the question
    papers' links.

    Fetches services/question_paper.php and returns the href of every anchor whose text
    contains "Semester". Anchors without an href attribute are skipped. """
    print(blue("Attempting to collect top-level links..."), end=" ")
    response = session.get(get_absolute_url(session, "services/question_paper.php"))
    parser = BeautifulSoup(content(response), "html.parser")
    # href=True limits the search to anchors that carry an href, so the subscript below
    # cannot raise KeyError on a bare <a> tag.
    top_level_links = [
        anchor["href"]
        for anchor in parser.find_all("a", href=True)
        if "Semester" in anchor.getText()
    ]
    print(green("Success."))
    return top_level_links
|
|
|
def collectDocumentLinks(session: requests.Session, top_level_links: List[str], course_id: str) -> List[str]:
    """ Go through each top level link and identify the lower level links to the documents we want to download.

    For every page, each <td> whose text equals `course_id` exactly is matched; the href of
    the first child of that cell's next sibling is collected. Pages whose body is not valid
    UTF-8 are reported as failed and skipped.

    Returns the collected hrefs. Exits the process with status 0 when nothing matched. """
    document_links = []
    print(blue("Analyzing top-level links to collect document links..."))
    for link in top_level_links:
        print(blue("Analyzing: \"{}\"...".format(link)), end=" ")
        # link[3:] drops the first three characters of the href (presumably a "../"
        # prefix -- TODO confirm against the portal's markup).
        response = session.post(get_absolute_url(session, link[3:]))
        try:
            page = content(response)
        except UnicodeDecodeError:
            # TODO: find out what's causing this.
            print(red("Failed."))
            continue

        parser = BeautifulSoup(page, "html.parser")
        for cell in parser.find_all("td"):
            if cell.getText() == course_id:
                # NOTE(review): assumes the matching cell is directly followed by a tag whose
                # first child is the <a> element -- verify against the portal's markup.
                document_links.append(list(cell.next_sibling.children)[0]["href"])
        print(green("OK."))

    if not document_links:
        print(yellow("Found no matching documents."))
        sys.exit(0)
    return document_links
|
|
|
def createDownloadsRootDirectory() -> None:
    """ Make sure DOWNLOADS_ROOT_DIRECTORY exists, announcing it when it has to be created. """
    target_directory = DOWNLOADS_ROOT_DIRECTORY
    if os.path.exists(target_directory):
        # Something already lives at that path: nothing to do.
        return
    print(yellow("Creating new directory: {}".format(target_directory)))
    os.mkdir(target_directory)
|
|
|
def downloadDocuments(session: requests.Session, document_links: List[str]) -> None:
    """ Fetch every document link and store the response body under DOWNLOADS_ROOT_DIRECTORY.

    Each file is named after the last path segment of its URL. """
    for relative_link in document_links:
        # The first three characters of the href are dropped before the URL is built.
        url = get_absolute_url(session, relative_link[3:])
        filename = url.rsplit("/", 1)[-1]
        destination = os.path.join(DOWNLOADS_ROOT_DIRECTORY, filename)
        print(blue("Downloading \"{}\"...".format(filename)), end=" ")
        response = session.post(url)
        with open(destination, "wb+") as output_file:
            output_file.write(response.content)
        print(green("Success."))
|
|
|
|
|
### Script Runner |
|
if __name__ == "__main__":
    # Entry point: prompt for the course and credentials, log in to the library portal,
    # list the matching question-paper links and download them on confirmation.
    if sys.version_info[0] != 3:
        print(yellow("*gasp* you're still using Python 2?! Use Python3!"))
        sys.exit(1)

    session = requests.session()
    course_id = input(blue("course id: "))  # e.g. CS F469
    username = input(blue("username: "))
    if os.name == "nt":
        # NOTE(review): on Windows the password is read with input() and is therefore
        # echoed to the terminal -- confirm whether getpass() can be used here as well.
        password = input(blue("password: "))
    else:
        password = getpass(blue("password: "))

    try:
        obtainSessionToken(session)
        authenticateSession(session, username, password)
        top_level_links = collectTopLevelLinks(session)
        document_links = collectDocumentLinks(session, top_level_links, course_id)

        print(blue("Found the following document links:"))
        for count, document_link in enumerate(document_links):
            print("\t{}. {}".format(count, document_link))

        # Keep asking until the answer is y or n; surrounding whitespace and letter case
        # are ignored so that "Y" or " n" are accepted too.
        choice = ""
        while choice not in ("y", "n"):
            choice = input(blue("\nDownload them (y/n)?")).strip().lower()
        if choice == "n":
            sys.exit(0)

        createDownloadsRootDirectory()
        downloadDocuments(session, document_links)
        print(green("\nMission complete!"))
    except requests.exceptions.ConnectionError:
        print(red("Failed."))
        print(yellow("Could not establish a stable connection with the library portal. The internet connection sucks as usual..."))
        sys.exit(1)
    except PermissionError:
        print(red("Failed."))
        print(yellow("OS: Permission error."))
        sys.exit(1)