Skip to content

Instantly share code, notes, and snippets.

@pgaskin
Created August 12, 2022 19:27
Show Gist options
  • Save pgaskin/db89cbf40e64ac34dd054cf3dff58bf8 to your computer and use it in GitHub Desktop.
Save pgaskin/db89cbf40e64ac34dd054cf3dff58bf8 to your computer and use it in GitHub Desktop.
// Package grdist parses Queen's University SOLUS (Oracle PeopleSoft Student
// Records 9) Grade Distribution Reports.
//
// Works as of August 2022.
package grdist
import (
"bytes"
"errors"
"fmt"
"io"
"math"
"os"
"regexp"
"strconv"
"strings"
"time"
"rsc.io/pdf"
)
// MaxSize is the maximum size, in bytes, of PDF to accept. Parse rejects
// anything larger with an error wrapping ErrParse before reading it; ParsePDF
// does not apply this limit.
var MaxSize int64 = 8192
// Report represents the contents of a Queen's University SOLUS (Oracle
// PeopleSoft Student Records 9) Grade Distribution Report.
type Report struct {
// AsOf is the "As of" date from the report header. The parser interprets it
// in UTC with no time-of-day component.
AsOf time.Time
// StudentName is the student's name exactly as it appears in the report.
StudentName string
// StudentNumber is the student number; the parser requires it to be written
// with exactly eight characters in the report and reads it as decimal.
StudentNumber int
// Institution is the value following "Institution:"; the parser only accepts
// "Queen's University".
Institution string
// Career is the non-empty value following "Academic Career:".
Career string
// Term is the value following "Term:", a year followed by Fall, Winter or
// Summer.
Term string
// Course holds one entry per table row, in the order the rows appear.
Course []Course
}
// String renders the report as plain text: a title, one line for each header
// field that is set (date, student, institution, academic career, term), a
// header row naming every grade, and then one line per course.
func (r Report) String() string {
	var sb strings.Builder
	sb.WriteString("Grade Distribution Report\n\n")

	if !r.AsOf.IsZero() {
		sb.WriteString("Date: " + r.AsOf.Format("2006/01/02") + "\n")
	}

	// The student line is printed when either part is known; the separating
	// space only appears when both are.
	hasName, hasNumber := r.StudentName != "", r.StudentNumber != 0
	if hasName || hasNumber {
		sb.WriteString("Student: ")
		if hasName {
			sb.WriteString(r.StudentName)
		}
		if hasName && hasNumber {
			sb.WriteByte(' ')
		}
		if hasNumber {
			sb.WriteString("#" + strconv.Itoa(r.StudentNumber))
		}
		sb.WriteByte('\n')
	}

	for _, field := range [...]struct{ label, value string }{
		{"Institution: ", r.Institution},
		{"Academic Career: ", r.Career},
		{"Term: ", r.Term},
	} {
		if field.value != "" {
			sb.WriteString(field.label + field.value + "\n")
		}
	}

	// Column header: every grade name, left-aligned in a two-character cell.
	sb.WriteString("\n # [")
	for g := Grade(0); g < MaxGrade; g++ {
		if g > 0 {
			sb.WriteByte(' ')
		}
		fmt.Fprintf(&sb, "%-2s", g)
	}
	sb.WriteString("]\n")

	for _, c := range r.Course {
		fmt.Fprintf(&sb, "%4d %s (%8s) %s\n", c.Enrollment, c.Distribution, c.Name, c.Description)
	}
	return sb.String()
}
// Course contains information about a course, taken from one row of the
// report's table.
type Course struct {
// Name is the course name: the two course header parts of the row joined by
// a single space, the first part being uppercase letters only.
Name string
// Description is the course description; when the description spans several
// text fragments they are joined with single spaces.
Description string
// Enrollment is the value of the row's "Enrollment" column.
Enrollment int
// Distribution holds the row's value for each grade column. The parser
// rejects values above 100, so these are presumably percentages (TODO
// confirm against a real report).
Distribution Distribution
}
// String formats the course on a single brace-delimited line: the name
// (right-aligned to eight characters), the enrollment, the grade distribution
// and the quoted description.
func (c Course) String() string {
	const layout = "{%8s: %4d%s %q}"
	return fmt.Sprintf(layout, c.Name, c.Enrollment, c.Distribution, c.Description)
}
// Distribution is the grade distribution of a course: one value per grade,
// indexed by the Grade constants (GradeAp through GradeF).
type Distribution [MaxGrade]int
// String renders the distribution as a bracketed, space-separated list with
// one entry per grade in Grade order. Values below ten get a leading space so
// that one- and two-digit values occupy the same width.
func (d Distribution) String() string {
	var sb strings.Builder
	sb.WriteByte('[')
	for i, n := range d {
		if i > 0 {
			sb.WriteByte(' ')
		}
		cell := strconv.Itoa(n)
		if n < 10 {
			cell = " " + cell
		}
		sb.WriteString(cell)
	}
	sb.WriteByte(']')
	return sb.String()
}
// GoString renders the distribution as a Go composite literal keyed by the
// Grade constant names, e.g. "Distribution{GradeAp: 3, GradeA: 7, ...}".
func (d Distribution) GoString() string {
	entries := make([]string, 0, len(d))
	for i, n := range d {
		entries = append(entries, Grade(i).GoString()+": "+strconv.Itoa(n))
	}
	return "Distribution{" + strings.Join(entries, ", ") + "}"
}
// Grade identifies one of the letter grades that appear as columns in the
// report, numbered from zero in the order A+, A, A-, ..., D-, F.
type Grade int

const (
	GradeAp Grade = iota // "A+"
	GradeA               // "A"
	GradeAm              // "A-"
	GradeBp              // "B+"
	GradeB               // "B"
	GradeBm              // "B-"
	GradeCp              // "C+"
	GradeC               // "C"
	GradeCm              // "C-"
	GradeDp              // "D+"
	GradeD               // "D"
	GradeDm              // "D-"
	GradeF               // "F"

	// MaxGrade is the number of grades. It is one past the last valid Grade
	// and is not itself a valid grade.
	MaxGrade
)

// gradeText maps each valid Grade to its textual form.
var gradeText = [MaxGrade]string{
	GradeAp: "A+",
	GradeA:  "A",
	GradeAm: "A-",
	GradeBp: "B+",
	GradeB:  "B",
	GradeBm: "B-",
	GradeCp: "C+",
	GradeC:  "C",
	GradeCm: "C-",
	GradeDp: "D+",
	GradeD:  "D",
	GradeDm: "D-",
	GradeF:  "F",
}

// gradeIdent maps each valid Grade to the name of its Go constant.
var gradeIdent = [MaxGrade]string{
	GradeAp: "GradeAp",
	GradeA:  "GradeA",
	GradeAm: "GradeAm",
	GradeBp: "GradeBp",
	GradeB:  "GradeB",
	GradeBm: "GradeBm",
	GradeCp: "GradeCp",
	GradeC:  "GradeC",
	GradeCm: "GradeCm",
	GradeDp: "GradeDp",
	GradeD:  "GradeD",
	GradeDm: "GradeDm",
	GradeF:  "GradeF",
}

// ParseGrade converts the textual form of a grade (such as "B+") into a Grade.
// It returns an error, along with the zero Grade, if s is not the exact text
// of a valid grade.
func ParseGrade(s string) (Grade, error) {
	var parsed Grade
	if err := parsed.UnmarshalText([]byte(s)); err != nil {
		return parsed, err
	}
	return parsed, nil
}

// MarshalText returns the textual form of g, or an error if g is outside the
// range of valid grades.
func (g Grade) MarshalText() ([]byte, error) {
	if g >= 0 && g < MaxGrade {
		return []byte(gradeText[g]), nil
	}
	return nil, fmt.Errorf("unknown grade %d", g)
}

// UnmarshalText sets *g to the grade whose textual form is exactly b. If no
// grade matches, it returns an error and leaves *g unchanged.
func (g *Grade) UnmarshalText(b []byte) error {
	for v, text := range gradeText {
		if bytes.Equal(b, []byte(text)) {
			*g = Grade(v)
			return nil
		}
	}
	return fmt.Errorf("unknown grade %q", string(b))
}

// String returns the textual form of g (such as "A-"), or "Invalid" if g is
// outside the range of valid grades.
func (g Grade) String() string {
	if g < 0 || g >= MaxGrade {
		return "Invalid"
	}
	return gradeText[g]
}

// GoString returns the name of the Go constant for g, or a "Grade(n)"
// conversion expression if g is outside the range of valid grades.
func (g Grade) GoString() string {
	if g < 0 || g >= MaxGrade {
		return fmt.Sprintf("Grade(%d)", g)
	}
	return gradeIdent[g]
}
// ErrParse is wrapped by the errors this package returns when a PDF's contents
// are not a recognizable grade distribution report. It is always wrapped with
// additional detail, so test for it with errors.Is rather than ==.
var ErrParse = errors.New("parse error")
// Parse parses a PDF grade distribution report of the given size from r.
//
// If the returned error wraps ErrParse (check with errors.Is), the PDF
// contents could not be parsed as a grade distribution report; this includes
// a size greater than MaxSize. Any other error means a problem occurred
// reading the PDF itself.
func Parse(r io.ReaderAt, size int64) (*Report, error) {
	if limit := MaxSize; size > limit {
		return nil, fmt.Errorf("%w: not a grade distribution: pdf is too large (maximum %d)", ErrParse, limit)
	}

	reader, err := pdf.NewReader(r, size)
	if err != nil {
		return nil, err
	}
	return parse(reader)
}
// ParsePDF is like Parse, but takes an existing *pdf.Reader and does not check
// the file size. Errors follow the same convention as Parse: those caused by
// unexpected report contents wrap ErrParse.
func ParsePDF(pr *pdf.Reader) (*Report, error) {
return parse(pr)
}
// ReadPDF opens the named file and parses it with Parse, so the MaxSize limit
// and Parse's error conventions apply. Errors from opening or measuring the
// file are returned as-is.
func ReadPDF(name string) (*Report, error) {
	file, err := os.Open(name)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	// Seeking to the end reports the file's length; Parse reads through
	// io.ReaderAt, which does not depend on the seek position.
	size, err := file.Seek(0, io.SeekEnd)
	if err != nil {
		return nil, err
	}
	return Parse(file, size)
}
// Patterns for the free-form text fields of the report. Each is anchored at
// both ends, so the whole text fragment must match.
var (
// reAsOf matches the "As of <day> <month name> <year>" line. Group 1 is the
// whole date, groups 2-4 are the day, month name and four-digit year.
reAsOf = regexp.MustCompile(`^As of (([0-9][0-9]?) (January|February|March|April|May|June|July|August|September|October|November|December) (2[0-9]{3}))$`)
// reTerm matches a term such as "2022 Fall". Group 1 is the whole term,
// groups 2 and 3 are the year and the season.
reTerm = regexp.MustCompile(`^((2[0-9]{3}) (Fall|Winter|Summer))$`)
// reCourse matches the first part of a course name: one or more uppercase
// ASCII letters.
reCourse = regexp.MustCompile(`^([A-Z]+)$`)
)
// parse extracts a Report from the first page of pr.
//
// It reads the page's text tokens in order and checks them against the fixed
// layout of the report: the title, the "As of" date, the student name and
// number, the institution, the academic career, the term, the minimum
// enrollment notice, the table header, and then one table row per course until
// the tokens run out.
//
// Errors caused by unexpected or missing content wrap ErrParse. Errors from
// reading the token stream are wrapped with context but do not wrap ErrParse.
// When the error is non-nil the returned *Report is either nil or only
// partially filled, depending on where parsing stopped.
func parse(pr *pdf.Reader) (*Report, error) {
	r := &Report{}
	if pr.NumPage() == 0 {
		return nil, fmt.Errorf("%w: no pages", ErrParse)
	}

	tkz := newPDFTokenizer(pr.Page(1).V.Key("Contents"))
	defer tkz.Close()

	// Title: must be the very first text on the page.
	if tk, err := tkz.Token(); err == io.EOF {
		return r, fmt.Errorf("%w: not a course grade distribution (no text on first page)", ErrParse)
	} else if err != nil {
		return r, fmt.Errorf("read first text token: %w", err)
	} else if tk.Tj != "Course Grade Distribution" {
		return r, fmt.Errorf("%w: not a course grade distribution (got %q as the first text token)", ErrParse, tk.Tj)
	}

	// "As of" date. On EOF the token is empty, so the pattern check below
	// reports it as an invalid date.
	if tk, err := tkz.Token(); err != nil && err != io.EOF {
		return r, fmt.Errorf("read as of date: %w", err)
	} else if m := reAsOf.FindStringSubmatch(tk.Tj); m == nil {
		return r, fmt.Errorf("%w: invalid as of date (got %q)", ErrParse, tk.Tj)
	} else if d, err := time.ParseInLocation("2 January 2006", strings.TrimPrefix(m[1], "0"), time.UTC); err != nil {
		return r, fmt.Errorf("%w: invalid as of date: %v", ErrParse, err)
	} else {
		r.AsOf = d
	}

	// Student name and number, which must share a line.
	if tks, err := tkz.Tokens(2); err == io.ErrUnexpectedEOF {
		return nil, fmt.Errorf("%w: incomplete student info", ErrParse)
	} else if err != nil {
		return nil, fmt.Errorf("read student info: %w", err)
	} else if tks[0].TmY != tks[1].TmY {
		return nil, fmt.Errorf("%w: expected student name/number to be on the same line, got %v %v", ErrParse, tks[0].TmY, tks[1].TmY)
	} else if len(tks[0].Tj) == 0 {
		return nil, fmt.Errorf("%w: invalid student name", ErrParse)
	} else if len(tks[1].Tj) != 8 {
		return nil, fmt.Errorf("%w: invalid student number", ErrParse)
	} else if n, err := strconv.ParseUint(tks[1].Tj, 10, 64); err != nil {
		return nil, fmt.Errorf("%w: invalid student number", ErrParse)
	} else {
		r.StudentName = tks[0].Tj
		r.StudentNumber = int(n)
	}

	// "Institution:" label and value.
	if tks, err := tkz.Tokens(2); err == io.ErrUnexpectedEOF {
		return nil, fmt.Errorf("%w: incomplete institution", ErrParse)
	} else if err != nil {
		return nil, fmt.Errorf("read institution: %w", err)
	} else if tks[0].Tj != "Institution:" {
		return nil, fmt.Errorf("%w: invalid institution (0: %q)", ErrParse, tks[0].Tj)
	} else if v := strings.TrimSpace(tks[1].Tj); v != "Queen's University" {
		return nil, fmt.Errorf("%w: unknown institution (got %q)", ErrParse, tks[1].Tj)
	} else {
		r.Institution = v
	}

	// "Academic Career:" label and value.
	if tks, err := tkz.Tokens(2); err == io.ErrUnexpectedEOF {
		return nil, fmt.Errorf("%w: incomplete career", ErrParse)
	} else if err != nil {
		return nil, fmt.Errorf("read career: %w", err)
	} else if tks[0].Tj != "Academic Career:" {
		return nil, fmt.Errorf("%w: invalid career (0: %q)", ErrParse, tks[0].Tj)
	} else if v := strings.TrimSpace(tks[1].Tj); len(v) == 0 {
		return nil, fmt.Errorf("%w: unknown career (got %q)", ErrParse, tks[1].Tj)
	} else {
		r.Career = v
	}

	// "Term:" label and value.
	if tks, err := tkz.Tokens(2); err == io.ErrUnexpectedEOF {
		return nil, fmt.Errorf("%w: incomplete term", ErrParse)
	} else if err != nil {
		return nil, fmt.Errorf("read term: %w", err)
	} else if tks[0].Tj != "Term:" {
		return nil, fmt.Errorf("%w: invalid term (0: %q)", ErrParse, tks[0].Tj)
	} else if m := reTerm.FindStringSubmatch(tks[1].Tj); m == nil {
		return nil, fmt.Errorf("%w: invalid term (got %q)", ErrParse, tks[1].Tj)
	} else {
		r.Term = m[1]
	}

	// Minimum enrollment notice. On EOF the token is empty and fails the
	// comparison below.
	if tk, err := tkz.Token(); err != nil && err != io.EOF {
		return r, fmt.Errorf("read minimum enrollment message: %w", err)
	} else if v := strings.TrimSpace(tk.Tj); v != "* Data is not displayed for courses with fewer than 10 enrollments." {
		return r, fmt.Errorf("%w: expected minimum enrollment message, got %q", ErrParse, v)
	}

	// Table header: three fixed columns followed by one column per grade, all
	// on the same line.
	if tks, err := tkz.Tokens(3 + int(MaxGrade)); err == io.ErrUnexpectedEOF {
		return nil, fmt.Errorf("%w: incomplete table header", ErrParse)
	} else if err != nil {
		return nil, fmt.Errorf("read table header: %w", err)
	} else {
		for i, tk := range tks {
			if tk.TmY != tks[0].TmY {
				return nil, fmt.Errorf("%w: got %d table column headers on the same line, expected more", ErrParse, i)
			}
			var x string
			switch i {
			case 0:
				x = "Course"
			case 1:
				x = "Description"
			case 2:
				x = "Enrollment"
			default:
				x = Grade(i - 3).String()
			}
			if tk.Tj != x {
				return nil, fmt.Errorf("%w: expected table header %d to be %q (got %q)", ErrParse, i+1, x, tk.Tj)
			}
		}
	}

	// Table rows. ry is the y position of the row being read; it starts as NaN,
	// which compares unequal to everything, so the first row never looks like a
	// continuation of a previous one.
	for ry := math.NaN(); ; {
		var c Course

		// Course name, part 1 (uppercase letters). Running out of tokens here
		// is the normal end of the table.
		if tk, err := tkz.Token(); err == io.EOF {
			break
		} else if err != nil {
			return r, fmt.Errorf("read next row course header part 1: %w", err)
		} else if tk.TmY == ry {
			return r, fmt.Errorf("%w: read next course header part 1: unexpected text %q on same line as previous row", ErrParse, tk.Tj)
		} else if m := reCourse.FindStringSubmatch(tk.Tj); m == nil {
			return r, fmt.Errorf("%w: invalid course header part 1 (got %q)", ErrParse, tk.Tj)
		} else {
			c.Name = m[1]
			ry = tk.TmY
		}

		// Course name, part 2, on the same line.
		if tk, err := tkz.Token(); err == io.EOF {
			return r, fmt.Errorf("%w: missing row course header part 2", ErrParse)
		} else if err != nil {
			return r, fmt.Errorf("read next row course header part 2: %w", err)
		} else if tk.TmY != ry {
			return r, fmt.Errorf("%w: read next course header part 2: expected y position %v, got %s", ErrParse, ry, tk)
		} else if v := strings.TrimSpace(tk.Tj); len(v) == 0 {
			return r, fmt.Errorf("%w: invalid course header part 2 (got %q)", ErrParse, v)
		} else {
			c.Name += " " + v
		}

		// First fragment of the description, on the same line.
		if tk, err := tkz.Token(); err == io.EOF {
			return r, fmt.Errorf("%w: missing row desc header first value", ErrParse)
		} else if err != nil {
			return r, fmt.Errorf("read row desc header first value: %w", err)
		} else if tk.TmY != ry {
			return r, fmt.Errorf("%w: read next desc header first value: expected y position %v, got %s", ErrParse, ry, tk)
		} else if v := strings.TrimSpace(tk.Tj); len(v) == 0 {
			return r, fmt.Errorf("%w: invalid desc header first value (got %q)", ErrParse, v)
		} else {
			c.Description = v
		}

		// Fragments at a different y position are treated as continuation
		// pieces of the description; the next fragment back on the row's own
		// line must be the enrollment.
		for {
			if tk, err := tkz.Token(); err == io.EOF {
				return r, fmt.Errorf("%w: missing row enrollment header (after description %q)", ErrParse, c.Description)
			} else if err != nil {
				return r, fmt.Errorf("read next row desc header value or enrollment header: %w", err)
			} else if tk.TmY == ry {
				if v, err := strconv.ParseUint(tk.Tj, 10, 64); err != nil {
					return r, fmt.Errorf("%w: invalid row enrollment header (got %q)", ErrParse, tk.Tj)
				} else {
					c.Enrollment = int(v)
					break
				}
			} else {
				if v := strings.TrimSpace(tk.Tj); len(v) != 0 {
					c.Description += " " + v
				}
			}
		}

		// One value per grade column.
		for g := Grade(0); g < MaxGrade; g++ {
			if tk, err := tkz.Token(); err == io.EOF {
				// A row cut short is a content problem like every other
				// truncation above, so it is reported as ErrParse rather than
				// as a bare EOF.
				return r, fmt.Errorf("%w: missing distribution value for grade %s", ErrParse, g)
			} else if err != nil {
				return r, fmt.Errorf("read distribution value for grade %s: %w", g, err)
			} else if v, err := strconv.ParseUint(tk.Tj, 10, 64); err != nil {
				return r, fmt.Errorf("%w: invalid distribution value for grade %s (got %q)", ErrParse, g, tk.Tj)
			} else if v > 100 {
				return r, fmt.Errorf("%w: distribution value out of range for grade %s (got %q)", ErrParse, g, tk.Tj)
			} else {
				c.Distribution[g] = int(v)
			}
		}

		r.Course = append(r.Course, c)
	}

	if len(r.Course) == 0 {
		return r, fmt.Errorf("%w: no courses found in report", ErrParse)
	}
	return r, nil
}
// pdfTokenizer hands out the text tokens of a PDF content stream one at a
// time. The tokens are produced by a goroutine started in newPDFTokenizer and
// consumed through Token and Tokens; Close tells that goroutine to stop.
type pdfTokenizer struct {
// err is the error returned by Token once the stream has ended or failed;
// after it is set, Token keeps returning it.
err error
// errCh carries the error recovered from a panic in the producer goroutine.
errCh chan error
// token carries the tokens; the producer closes it when it exits.
token chan pdfToken
// cancel is closed by Close to make the producer stop.
cancel chan struct{}
}
// pdfToken is one piece of text shown by a Tj operator, together with the
// translation taken from the most recent Tm operator that preceded it.
type pdfToken struct {
	Tj  string  // the text shown
	TmX float64 // the last Tm X value
	TmY float64 // the last Tm Y value
}

// String formats the token as its quoted text followed by its position, for
// use in error messages.
func (t pdfToken) String() string {
	position := fmt.Sprintf("(%v, %v)", t.TmX, t.TmY)
	return strconv.Quote(t.Tj) + " @ " + position
}
// newPDFTokenizer starts a goroutine that interprets the PDF content stream
// strm and sends one pdfToken on the tokenizer's token channel for every Tj
// operator it encounters. Each token carries the Tj text and the translation
// from the most recent Tm operator.
//
// The goroutine ends when the stream has been fully interpreted, when a panic
// occurs while interpreting it (including strm not being a stream), or after
// Close is called. In every case the token channel is closed; a panic is
// additionally reported on errCh. Callers must call Close when done, otherwise
// the goroutine may stay blocked sending a token nobody receives.
func newPDFTokenizer(strm pdf.Value) *pdfTokenizer {
tkz := &pdfTokenizer{
// Buffered so the goroutine can report an error and exit even when nobody
// is receiving.
errCh: make(chan error, 1),
token: make(chan pdfToken),
cancel: make(chan struct{}),
}
go func() {
// Deferred calls run last-in-first-out: the recover handler below runs
// before this close, so a recovered error is already queued on errCh by the
// time the consumer sees the token channel closed.
defer close(tkz.token)
defer func() {
// Panics are used to abort interpretation; convert them into an error.
if x := recover(); x != nil {
switch x := x.(type) {
case error:
tkz.errCh <- fmt.Errorf("pdf parse error: %w", x)
default:
tkz.errCh <- fmt.Errorf("pdf parse error: %v", x)
}
close(tkz.errCh)
}
}()
if strm.Kind() != pdf.Stream {
panic("contents not a stream")
}
// tk lives across operator callbacks so that the position set by a Tm
// operator is carried into the tokens of the Tj operators that follow it.
var tk pdfToken
pdf.Interpret(strm, func(stk *pdf.Stack, op string) {
// Stop promptly once the tokenizer has been closed.
select {
case <-tkz.cancel:
panic("reader closed")
default:
}
switch op {
case "Tm":
// The operand on top of the stack is taken as the vertical translation
// and the one below it as the horizontal translation.
if v := stk.Pop(); v.Kind() != pdf.Real && v.Kind() != pdf.Integer {
panic("Tm instruction vertical translation invalid or missing")
} else {
tk.TmY = v.Float64()
}
if v := stk.Pop(); v.Kind() != pdf.Real && v.Kind() != pdf.Integer {
panic("Tm instruction horizontal translation invalid or missing")
} else {
tk.TmX = v.Float64()
}
case "Tj":
if v := stk.Pop(); v.Kind() != pdf.String {
panic("Tj instruction text invalid or missing")
} else {
tk.Tj = v.Text()
}
}
// Discard whatever operands remain so they do not leak into the next
// operator.
for stk.Len() != 0 {
stk.Pop()
}
if op == "Tj" {
// Hand the token to the consumer, or give up if the tokenizer is closed
// while waiting.
select {
case <-tkz.cancel:
panic("reader closed")
case tkz.token <- tk:
}
}
})
}()
return tkz
}
// Token returns the next token of the stream. Once the stream is exhausted it
// returns io.EOF; if the producer failed, it returns that failure instead.
// Whichever error ends the stream is remembered and returned by every later
// call.
func (t *pdfTokenizer) Token() (pdfToken, error) {
	if t.err == nil {
		if tk, open := <-t.token; open {
			return tk, nil
		}
		// The producer has exited: pick up its error if it reported one,
		// otherwise the stream simply ended.
		select {
		case failure := <-t.errCh:
			t.err = failure
		default:
			t.err = io.EOF
		}
	}
	return pdfToken{}, t.err
}
// Tokens reads exactly n tokens. If the stream ends first it returns
// io.ErrUnexpectedEOF; any other error from Token is returned unchanged. In
// both cases the tokens read so far are returned alongside the error.
func (t *pdfTokenizer) Tokens(n int) ([]pdfToken, error) {
	out := make([]pdfToken, 0, n)
	for len(out) < n {
		tk, err := t.Token()
		switch {
		case err == io.EOF:
			return out, io.ErrUnexpectedEOF
		case err != nil:
			return out, err
		}
		out = append(out, tk)
	}
	return out, nil
}
// Close signals the producer goroutine to stop by closing the cancel channel.
// It must be called at most once: closing an already-closed channel panics.
func (t *pdfTokenizer) Close() {
close(t.cancel)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment