Last active
September 8, 2019 15:19
-
-
Save nmanumr/3ecb2c928bc5af382fda3ba9aaaf7e17 to your computer and use it in GitHub Desktop.
Script to parse COMSATS Lahore Timetable from PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import calendar | |
import re | |
import pdfplumber | |
class TableCell:
    """Text of one table cell together with its position and size.

    ``rect`` is read by index as ``(x0, y0, x1, y1)``: the first two entries
    become ``left`` and ``top``; ``width`` and ``height`` are the differences
    ``x1 - x0`` and ``y1 - y0``.
    """

    def __init__(self, rect, text):
        x0, y0, x1, y1 = rect[0], rect[1], rect[2], rect[3]
        self.left, self.top = x0, y0
        self.width, self.height = x1 - x0, y1 - y0
        self.text = text
class TimeTableSlot:
    """One time-slot column parsed from a header cell.

    The stripped cell text must start with a single-digit slot number on its
    own line, followed by a ``H:MM - H:MM`` time range on the next line.

    Attributes:
        left: copied from ``cell.left``; used to line lectures up with slots.
        slot: slot number, kept as the matched string.
        startTime: start of the range, kept as the matched string.
        endTime: end of the range, kept as the matched string.
    """

    TEXT_RE = re.compile(r"(\d)\n(\d{1,2}:\d{1,2})\s+-\s+(\d{1,2}:\d{1,2})", re.MULTILINE)

    def __init__(self, cell):
        """Parse ``cell.text`` into slot number and start/end times.

        Raises:
            ValueError: if the cell text does not match ``TEXT_RE``.
        """
        self.left = cell.left
        match = self.TEXT_RE.match(cell.text.strip())
        if match is None:
            # Fail with the offending text instead of an opaque
            # "'NoneType' object has no attribute 'groups'".
            raise ValueError("Unrecognised time slot header. Cell text is: \n%s" % cell.text)
        self.slot, self.startTime, self.endTime = match.groups()
class TimeTableLecture:
    """A single lecture parsed from one body cell of the timetable.

    Attributes:
        weekday: index of the day in a Monday-first week (0 for "mo"), or
            ``None`` when the given day name is not recognised.
        has_double_slot: ``True`` when ``cell.width`` exceeds
            ``DOUBLE_SLOT_MIN_WIDTH``.
        slots: the slot whose ``left`` matches the cell's ``left`` (compared
            after truncating both to ``int``), plus the following slot for a
            double-width cell. Empty when no slot matches.
        teacher: first line of a three-line cell; ``None`` for two-line cells.
        subject: subject line of the cell.
        room: last line of the cell.
    """

    # Cells wider than this are treated as spanning two consecutive slots.
    DOUBLE_SLOT_MIN_WIDTH = 200

    def __init__(self, cell, slots, weekday_name):
        """Build a lecture from ``cell``, the parsed ``slots`` and a day label.

        Raises:
            NotImplementedError: if the cell text has neither two nor three
                lines.
        """
        self.weekday = self.weekday_from_name(weekday_name)
        self.has_double_slot = cell.width > self.DOUBLE_SLOT_MIN_WIDTH

        # Always define these so later attribute access cannot fail with
        # AttributeError when no slot matches or the cell has no teacher line.
        self.slots = []
        self.teacher = None

        for index, slot in enumerate(slots):
            if int(slot.left) != int(cell.left):
                continue
            self.slots = [slot]
            # A double-width cell in the last column has no following slot.
            if self.has_double_slot and index + 1 < len(slots):
                self.slots.append(slots[index + 1])
            break

        data = cell.text.split("\n")
        if len(data) == 3:
            self.teacher, self.subject, self.room = data
        elif len(data) == 2:
            self.subject, self.room = data
        else:
            raise NotImplementedError("Multi line teacher/subject name not handled yet. Cell text is: \n%s" % cell.text)

    @staticmethod
    def weekday_from_name(weekday_name):
        """Return the Monday-first index of a two-letter day name.

        The lookup is case-insensitive. Returns ``None`` for any name that is
        not one of "mo", "tu", "we", "th", "fr", "sa", "su".
        """
        weekdays = ["mo", "tu", "we", "th", "fr", "sa", "su"]
        name = weekday_name.lower()
        if name in weekdays:
            return weekdays.index(name)
        return None

    @property
    def weekday_name(self):
        """Abbreviated day name from ``calendar.day_abbr`` for ``self.weekday``.

        Only meaningful when ``self.weekday`` is not ``None``.
        """
        return calendar.day_abbr[self.weekday]
class TimeTable:
    """Timetable read from a single PDF page.

    The constructor only reads the class name from the page; call
    ``parse_timetable()`` to populate ``slots`` and ``lectures``.
    """

    def __init__(self, page):
        self.page = page
        self.class_name = self._get_class_name()

    def parse_timetable(self):
        """Parse cells, then the slot header row, then the lectures."""
        self._parse_timetable_cells()
        self._parse_timetable_slots()
        self._parse_lectures()

    def _text_in_rect(self, rect):
        """Return the text inside ``rect`` of the page, never ``None``.

        ``extract_text()`` may yield ``None`` for an empty region; that is
        normalised to ``""`` so callers can use string operations safely.
        """
        return self.page.crop(rect).extract_text() or ""

    def _get_class_name(self):
        """Read the class name from a fixed strip near the top of the page."""
        return self._text_in_rect((70, 30, self.page.width, 60))

    def _parse_timetable_cells(self):
        """Collect the non-empty cells of the page's first table into ``_rows``.

        The first table row is skipped. An empty ``find_tables()`` result
        makes the ``[0]`` lookup raise ``IndexError``.
        """
        table = self.page.find_tables()[0]
        self._rows = [
            [TableCell(cell, self._text_in_rect(cell)) for cell in row.cells if cell]
            for row in table.rows[1:]
        ]

    def _parse_timetable_slots(self):
        """Build ``slots`` from the cells with text in the first kept row."""
        # Guard against a table that has no rows left after the skipped one.
        header = self._rows[0] if self._rows else []
        self.slots = [TimeTableSlot(cell) for cell in header if cell.text]

    def _parse_lectures(self):
        """Build ``lectures`` from every row after the slot header row.

        A cell whose stripped text is exactly two characters long is taken as
        the day label for the lecture cells that follow it.
        """
        self.lectures = []
        last_day = ""
        for row in self._rows[1:]:
            for cell in row:
                if not cell.text:
                    continue
                label = cell.text.strip()
                if len(label) == 2:
                    last_day = label
                else:
                    self.lectures.append(TimeTableLecture(cell, self.slots, last_day))
def print_time_table(timetable):
    """Print one line per lecture: weekday, first slot number, subject, room."""
    # Lazy generator: each line is formatted right before it is printed.
    rendered = (
        f"{lec.weekday_name} - {lec.slots[0].slot} - {lec.subject} ({lec.room})"
        for lec in timetable.lectures
    )
    for line in rendered:
        print(line)
if __name__ == "__main__":
    # Command-line entry point: select pages of the PDF by class name and/or
    # by 1-based page number, parse each selected page and print its lectures.
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", required=True, help="PDF file to parse")
    parser.add_argument("-c", "--classes", help='Classes name like (FA18-BCS-C)', nargs="+")
    parser.add_argument("-p", "--pages", help='Pages to parse', nargs="+", type=int)
    args = vars(parser.parse_args())

    # Without a selection the script would silently print nothing.
    if not args["classes"] and not args["pages"]:
        parser.error("nothing to parse: pass --classes and/or --pages")

    timetables = []
    with pdfplumber.open(args["file"]) as pdf:
        if args["classes"]:
            for page in pdf.pages:
                timetable = TimeTable(page)
                # A page with no extractable header text may have no name.
                class_name = timetable.class_name or ""
                # any(): parse a page once even if several names match it.
                if any(klass in class_name for klass in args["classes"]):
                    timetable.parse_timetable()
                    timetables.append(timetable)
        if args["pages"]:
            page_count = len(pdf.pages)
            for page_number in args["pages"]:
                # Page numbers are 1-based; 0 would otherwise wrap around to
                # the last page through negative indexing.
                if not 1 <= page_number <= page_count:
                    parser.error(f"page {page_number} is out of range (1-{page_count})")
                timetable = TimeTable(pdf.pages[page_number - 1])
                timetable.parse_timetable()
                timetables.append(timetable)

    for timetable in timetables:
        print("\n", timetable.class_name)
        print_time_table(timetable)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Running the script
Features and Limitations
output: