Skip to content

Instantly share code, notes, and snippets.

@nmanumr
Last active September 8, 2019 15:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nmanumr/3ecb2c928bc5af382fda3ba9aaaf7e17 to your computer and use it in GitHub Desktop.
Save nmanumr/3ecb2c928bc5af382fda3ba9aaaf7e17 to your computer and use it in GitHub Desktop.
Script to parse COMSATS Lahore Timetable from PDF
import argparse
import calendar
import re
import pdfplumber
class TableCell:
    """Position, size and text of one table cell.

    ``rect`` is read by index: items 0 and 1 become ``left`` and ``top``,
    while items 2 and 3 are treated as the opposite edges and are used to
    derive ``width`` and ``height``.  (Presumably this is a pdfplumber
    ``(x0, top, x1, bottom)`` bounding box -- confirm against the caller.)
    """

    def __init__(self, rect, text):
        x_start, y_start = rect[0], rect[1]
        x_end, y_end = rect[2], rect[3]

        self.left, self.top = x_start, y_start
        # Size is stored instead of the far edges.
        self.width = x_end - x_start
        self.height = y_end - y_start
        self.text = text
class TimeTableSlot:
    """One time slot parsed from a timetable header cell.

    Attributes:
        left: ``left`` coordinate copied from the header cell.
        slot: slot number, kept as the matched text (e.g. ``"1"``).
        startTime: start of the slot as matched text (e.g. ``"8:30"``).
        endTime: end of the slot as matched text (e.g. ``"10:00"``).

    Raises:
        ValueError: if the cell text does not look like
            ``"<slot>\\n<start> - <end>"``.
    """

    # Slot number, a newline, then "H:MM - H:MM".  ``\d+`` (rather than a
    # single ``\d``) also accepts slot numbers of 10 and above.
    TEXT_RE = re.compile(r"(\d+)\n(\d{1,2}:\d{1,2})\s+-\s+(\d{1,2}:\d{1,2})", re.MULTILINE)

    def __init__(self, cell):
        # Text extraction can yield None for an empty region; treat it as "".
        text = (cell.text or "").strip()
        match = self.TEXT_RE.match(text)
        if match is None:
            # Fail with the offending text instead of an AttributeError on None.
            raise ValueError("Unrecognised time slot header. Cell text is: \n%s" % cell.text)
        self.left = cell.left
        self.slot, self.startTime, self.endTime = match.groups()
class TimeTableLecture:
    """A single lecture parsed from one timetable cell.

    Attributes:
        weekday: index of the day, 0 for ``"mo"`` through 6 for ``"su"``, or
            ``None`` when the day label is not recognised.
        has_double_slot: ``True`` when the cell is wider than
            ``DOUBLE_SLOT_MIN_WIDTH``.
        slots: slot objects the lecture occupies; an empty list when no
            slot's ``left`` lines up with the cell.
        teacher: first text line of a three-line cell, ``None`` for a
            two-line cell.
        subject: subject line of the cell.
        room: last line of the cell.

    Raises:
        NotImplementedError: when the cell text has neither two nor three
            lines.
    """

    # Cells wider than this are treated as spanning two consecutive slots.
    DOUBLE_SLOT_MIN_WIDTH = 200

    def __init__(self, cell, slots, weekday_name):
        self.weekday = self.weekday_from_name(weekday_name)
        self.has_double_slot = cell.width > self.DOUBLE_SLOT_MIN_WIDTH
        self.slots = self._find_slots(cell, slots, self.has_double_slot)

        data = cell.text.split("\n")
        if len(data) == 3:
            self.teacher, self.subject, self.room = data
        elif len(data) == 2:
            # No teacher line; keep the attribute defined for consumers.
            self.teacher = None
            self.subject, self.room = data
        else:
            raise NotImplementedError("Multi line teacher/subject name not handled yet. Cell text is: \n%s" % cell.text)

    @staticmethod
    def _find_slots(cell, slots, double):
        """Return the slot(s) whose integer ``left`` equals the cell's."""
        cell_left = int(cell.left)
        for index, slot in enumerate(slots):
            if int(slot.left) != cell_left:
                continue
            found = [slot]
            # A double-width cell in the last column has no following slot.
            if double and index + 1 < len(slots):
                found.append(slots[index + 1])
            return found
        return []

    @staticmethod
    def weekday_from_name(weekday_name):
        """Map a two-letter day label (any case) to 0..6, or ``None``."""
        weekdays = ("mo", "tu", "we", "th", "fr", "sa", "su")
        try:
            return weekdays.index(weekday_name.lower())
        except ValueError:
            return None

    @property
    def weekday_name(self):
        """Abbreviated day name from ``calendar.day_abbr``.

        Only meaningful when ``weekday`` is not ``None``.
        """
        return calendar.day_abbr[self.weekday]
class TimeTable:
    """Timetable read from a single PDF page.

    Constructing the object only reads the class name from the page;
    call :meth:`parse_timetable` to fill ``slots`` and ``lectures``.

    Attributes:
        page: the page object passed in.  The code uses its ``width``,
            ``crop(rect).extract_text()`` and ``find_tables()``.
        class_name: text found in the title region of the page, ``""`` when
            nothing is extracted.
        slots: ``TimeTableSlot`` objects built from the header row.
        lectures: ``TimeTableLecture`` objects built from the remaining rows.
    """

    def __init__(self, page):
        self.page = page
        self.class_name = self._get_class_name()

    def parse_timetable(self):
        """Parse cells, then slots, then lectures (each step needs the previous)."""
        self._parse_timetable_cells()
        self._parse_timetable_slots()
        self._parse_lectures()

    def _text_in_rect(self, rect):
        """Return the text extracted from the ``rect`` region of the page."""
        return self.page.crop(rect).extract_text()

    def _get_class_name(self):
        """Return the text of the fixed title region, or ``""`` if none."""
        # Hard-coded region; presumably where the class title is printed.
        # ``or ""`` keeps ``class_name`` usable in ``in`` tests when the
        # extraction yields None.
        return self._text_in_rect((70, 30, self.page.width, 60)) or ""

    def _parse_timetable_cells(self):
        """Build ``self._rows``: lists of ``TableCell`` for each table row."""
        tables = self.page.find_tables()
        # Only the first table is used and its first row is skipped.  A page
        # without any table yields no rows instead of raising IndexError.
        rows = tables[0].rows[1:] if tables else []
        self._rows = [
            [TableCell(cell, self._text_in_rect(cell)) for cell in row.cells if cell]
            for row in rows
        ]

    def _parse_timetable_slots(self):
        """Build ``self.slots`` from the non-empty cells of the first kept row."""
        header = self._rows[0] if self._rows else []
        self.slots = [
            TimeTableSlot(cell)
            for cell in header
            if cell.text and cell.text.strip()
        ]

    def _parse_lectures(self):
        """Build ``self.lectures`` from every row after the slot row.

        A cell whose stripped text is exactly two characters is taken as a
        day label and applies to the lecture cells that follow it.
        """
        self.lectures = []
        last_day = ""
        for row in self._rows[1:]:
            for cell in row:
                text = (cell.text or "").strip()
                if not text:
                    # Empty or whitespace-only cell: nothing scheduled.
                    continue
                if len(text) == 2:
                    last_day = text
                else:
                    self.lectures.append(TimeTableLecture(cell, self.slots, last_day))
def print_time_table(timetable):
    """Print one line per lecture: ``<day> - <first slot> - <subject> (<room>)``."""
    for lecture in timetable.lectures:
        day = lecture.weekday_name
        # Only the first slot of a lecture is shown.
        slot_number = lecture.slots[0].slot
        subject = lecture.subject
        room = lecture.room
        print(f"{day} - {slot_number} - {subject} ({room})")
def _build_arg_parser():
    """Create the command-line parser for the timetable script."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", required=True, help="PDF file to parse")
    parser.add_argument("-c", "--classes", help='Classes name like (FA18-BCS-C)', nargs="+")
    parser.add_argument("-p", "--pages", help='Pages to parse', nargs="+", type=int)
    return parser


def main(argv=None):
    """Parse the requested timetables from a PDF and print them.

    Args:
        argv: argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Exits with a usage error when neither ``--classes`` nor ``--pages`` is
    given, or when a page number is outside ``1..<number of pages>``.
    """
    parser = _build_arg_parser()
    args = vars(parser.parse_args(argv))
    wanted_classes = args["classes"] or []
    wanted_pages = args["pages"] or []
    if not wanted_classes and not wanted_pages:
        parser.error("nothing to parse: pass --classes and/or --pages")

    timetables = []
    with pdfplumber.open(args["file"]) as pdf:
        page_count = len(pdf.pages)
        # Page numbers are 1-based.  Reject 0 and negatives explicitly:
        # they would otherwise index from the end of the page list.
        for number in wanted_pages:
            if not 1 <= number <= page_count:
                parser.error("page %d is out of range (1-%d)" % (number, page_count))

        if wanted_classes:
            for page in pdf.pages:
                timetable = TimeTable(page)
                title = timetable.class_name or ""
                # ``any`` adds a page once even if several names match it.
                if any(klass in title for klass in wanted_classes):
                    timetable.parse_timetable()
                    timetables.append(timetable)

        for number in wanted_pages:
            timetable = TimeTable(pdf.pages[number - 1])
            timetable.parse_timetable()
            timetables.append(timetable)

    for timetable in timetables:
        print("\n", timetable.class_name)
        print_time_table(timetable)


if __name__ == "__main__":
    main()
@nmanumr
Copy link
Author

nmanumr commented Sep 8, 2019

Running the script

# install dependencies
sudo pip3 install pdfplumber

# running parser with the class name
python3 timetableParser.py -f sample.pdf -c FA18-BCS-C

# running parser with the page number
python3 timetableParser.py -f sample.pdf -p 137

Features and Limitations

  • Can handle complex timetables like:
    image

output:

 FA16-CHE-B (Semester 7)
Mon - 4 - Process Control (N-1)
Mon - 5 - Petrochemical Engineering (G1 N-1)
Mon - 6 - Polymer Engineering (N-2)
Mon - 5 - Waste Management (G2 N-2)
Mon - 5 - Industrial Energy Systems (G3 D-117)
Tue - 3 - Process Modelling and Simulation (N-2)
Tue - 4 - Chemical Engineering Plant Design (D-114)
Tue - 5 - Lab-Process Modelling and Simulation (PC Lab-Pharmacy)
Tue - 6 - Process Instrumentation (N-1)
Wed - 3 - Lab-Process Control (G1 Che Lab-13)
Wed - 5 - Petrochemical Engineering (G1 N-1)
Wed - 6 - Process Modelling and Simulation (PC Lab-Pharmacy)
Wed - 1 - Lab-Process Control (G2 Che Lab-13)
Wed - 5 - Waste Management (G2 N-2)
Wed - 5 - Industrial Energy Systems (G3 D-117)
Thu - 3 - Polymer Engineering (N-12)
Thu - 5 - Lab-Process Modelling and Simulation (PC Lab-Pharmacy)
Thu - 6 - Chemical Engineering Plant Design (N-3)
Fri - 3 - Process Instrumentation (D-117)
Fri - 5 - Process Control (N-2)
  • Can't handle multi-line text for teacher and subject names yet, such as:
    image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment