Created
August 12, 2022 19:27
-
-
Save pgaskin/db89cbf40e64ac34dd054cf3dff58bf8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Package grdist parses Queen's University SOLUS (Oracle PeopleSoft Student
// Records 9) Grade Distribution Reports.
//
// Works as of August 2022.
package grdist | |
import ( | |
"bytes" | |
"errors" | |
"fmt" | |
"io" | |
"math" | |
"os" | |
"regexp" | |
"strconv" | |
"strings" | |
"time" | |
"rsc.io/pdf" | |
) | |
// MaxSize is the largest PDF size, in bytes, that Parse accepts. It is a
// variable so callers can raise or lower the limit.
var MaxSize int64 = 8 << 10
// Report represents the contents of a Queen's University SOLUS (Oracle | |
// PeopleSoft Student Records 9) Grade Distribution Report. | |
type Report struct { | |
AsOf time.Time | |
StudentName string | |
StudentNumber int | |
Institution string | |
Career string | |
Term string | |
Course []Course | |
} | |
func (r Report) String() string { | |
var b strings.Builder | |
b.WriteString("Grade Distribution Report\n\n") | |
if !r.AsOf.IsZero() { | |
b.WriteString("Date: ") | |
b.WriteString(r.AsOf.Format("2006/01/02")) | |
b.WriteByte('\n') | |
} | |
if r.StudentName != "" || r.StudentNumber != 0 { | |
b.WriteString("Student: ") | |
if r.StudentName != "" { | |
b.WriteString(r.StudentName) | |
} | |
if r.StudentName != "" && r.StudentNumber != 0 { | |
b.WriteByte(' ') | |
} | |
if r.StudentNumber != 0 { | |
b.WriteByte('#') | |
b.WriteString(strconv.Itoa(r.StudentNumber)) | |
} | |
b.WriteByte('\n') | |
} | |
if r.Institution != "" { | |
b.WriteString("Institution: ") | |
b.WriteString(r.Institution) | |
b.WriteByte('\n') | |
} | |
if r.Career != "" { | |
b.WriteString("Academic Career: ") | |
b.WriteString(r.Career) | |
b.WriteByte('\n') | |
} | |
if r.Term != "" { | |
b.WriteString("Term: ") | |
b.WriteString(r.Term) | |
b.WriteByte('\n') | |
} | |
for g := Grade(0); g < MaxGrade; g++ { | |
if g != 0 { | |
b.WriteByte(' ') | |
} else { | |
b.WriteString("\n # [") | |
} | |
b.WriteString(fmt.Sprintf("%-2s", g)) | |
} | |
b.WriteString("]\n") | |
for _, c := range r.Course { | |
b.WriteString(fmt.Sprintf("%4d %s (%8s) %s", c.Enrollment, c.Distribution, c.Name, c.Description)) | |
b.WriteByte('\n') | |
} | |
return b.String() | |
} | |
// Course contains information about a course | |
type Course struct { | |
Name string | |
Description string | |
Enrollment int | |
Distribution Distribution | |
} | |
func (c Course) String() string { | |
return fmt.Sprintf("{%8s: %4d%s %q}", c.Name, c.Enrollment, c.Distribution, c.Description) | |
} | |
// Distribution is the grade distribution. | |
type Distribution [MaxGrade]int | |
func (d Distribution) String() string { | |
var b strings.Builder | |
b.WriteByte('[') | |
for v := Grade(0); v < MaxGrade; v++ { | |
if v != 0 { | |
b.WriteByte(' ') | |
} | |
if d[v] < 10 { | |
b.WriteByte(' ') | |
} | |
b.WriteString(strconv.Itoa(d[v])) | |
} | |
b.WriteByte(']') | |
return b.String() | |
} | |
func (d Distribution) GoString() string { | |
var b strings.Builder | |
b.WriteString("Distribution{") | |
for v := Grade(0); v < MaxGrade; v++ { | |
if v != 0 { | |
b.WriteByte(',') | |
b.WriteByte(' ') | |
} | |
b.WriteString(v.GoString()) | |
b.WriteByte(':') | |
b.WriteByte(' ') | |
b.WriteString(strconv.Itoa(d[v])) | |
} | |
b.WriteByte('}') | |
return b.String() | |
} | |
// Grade identifies one of the letter grades a Distribution is indexed by.
type Grade int

// The grades, numbered from zero in the order A+ through F. MaxGrade is the
// number of grades and is used as the length of Distribution.
const (
	GradeAp  Grade = iota // A+
	GradeA                // A
	GradeAm               // A-
	GradeBp               // B+
	GradeB                // B
	GradeBm               // B-
	GradeCp               // C+
	GradeC                // C
	GradeCm               // C-
	GradeDp               // D+
	GradeD                // D
	GradeDm               // D-
	GradeF                // F
	MaxGrade              // number of grades; not a grade itself
)
func ParseGrade(s string) (Grade, error) { | |
var g Grade | |
err := g.UnmarshalText([]byte(s)) | |
return g, err | |
} | |
func (g Grade) MarshalText() ([]byte, error) { | |
if g < 0 || g >= MaxGrade { | |
return nil, fmt.Errorf("unknown grade %d", g) | |
} | |
return []byte(g.String()), nil | |
} | |
func (g *Grade) UnmarshalText(b []byte) error { | |
for v := Grade(0); v < MaxGrade; v++ { | |
if bytes.Equal([]byte(v.String()), b) { | |
*g = v | |
return nil | |
} | |
} | |
return fmt.Errorf("unknown grade %q", string(b)) | |
} | |
func (g Grade) String() string { | |
switch g { | |
case GradeAp: | |
return "A+" | |
case GradeA: | |
return "A" | |
case GradeAm: | |
return "A-" | |
case GradeBp: | |
return "B+" | |
case GradeB: | |
return "B" | |
case GradeBm: | |
return "B-" | |
case GradeCp: | |
return "C+" | |
case GradeC: | |
return "C" | |
case GradeCm: | |
return "C-" | |
case GradeDp: | |
return "D+" | |
case GradeD: | |
return "D" | |
case GradeDm: | |
return "D-" | |
case GradeF: | |
return "F" | |
} | |
return "Invalid" | |
} | |
func (g Grade) GoString() string { | |
switch g { | |
case GradeAp: | |
return "GradeAp" | |
case GradeA: | |
return "GradeA" | |
case GradeAm: | |
return "GradeAm" | |
case GradeBp: | |
return "GradeBp" | |
case GradeB: | |
return "GradeB" | |
case GradeBm: | |
return "GradeBm" | |
case GradeCp: | |
return "GradeCp" | |
case GradeC: | |
return "GradeC" | |
case GradeCm: | |
return "GradeCm" | |
case GradeDp: | |
return "GradeDp" | |
case GradeD: | |
return "GradeD" | |
case GradeDm: | |
return "GradeDm" | |
case GradeF: | |
return "GradeF" | |
} | |
return fmt.Sprintf("Grade(%d)", g) | |
} | |
// ErrParse is wrapped by errors that report PDF contents which could not be
// parsed as a grade distribution report. Check for it with errors.Is.
var ErrParse = errors.New("parse error")
// Parse parses a PDF grade distribution report from r. If the returned error | |
// is ErrParse, the PDF contents were unable to be parsed. If the returned | |
// error is anything else, a problem occured reading the PDF itself. | |
func Parse(r io.ReaderAt, size int64) (*Report, error) { | |
if size > MaxSize { | |
return nil, fmt.Errorf("%w: not a grade distribution: pdf is too large (maximum %d)", ErrParse, MaxSize) | |
} | |
pr, err := pdf.NewReader(r, size) | |
if err != nil { | |
return nil, err | |
} | |
return ParsePDF(pr) | |
} | |
// ParsePDF is like Parse, but takes an existing *pdf.Reader and does not check | |
// the file size. | |
func ParsePDF(pr *pdf.Reader) (*Report, error) { | |
return parse(pr) | |
} | |
// ReadPDF is like Parse, but opens a file. | |
func ReadPDF(name string) (*Report, error) { | |
f, err := os.Open(name) | |
if err != nil { | |
return nil, err | |
} | |
defer f.Close() | |
sz, err := f.Seek(0, io.SeekEnd) | |
if err != nil { | |
return nil, err | |
} | |
return Parse(f, sz) | |
} | |
// reAsOf matches the "As of <day> <month name> <year>" line. Group 1 is the
// whole date, groups 2-4 are the day, month name and year.
var reAsOf = regexp.MustCompile(`^As of (([0-9][0-9]?) (January|February|March|April|May|June|July|August|September|October|November|December) (2[0-9]{3}))$`)

// reTerm matches a term such as "2022 Fall". Group 1 is the whole term,
// groups 2 and 3 are the year and the season.
var reTerm = regexp.MustCompile(`^((2[0-9]{3}) (Fall|Winter|Summer))$`)

// reCourse matches the first token of a course row, which must consist only
// of upper-case ASCII letters.
var reCourse = regexp.MustCompile(`^([A-Z]+)$`)
// ParsePDF is like Parse, but takes an existing *pdf.Reader. | |
func parse(pr *pdf.Reader) (*Report, error) { | |
r := &Report{} | |
if pr.NumPage() == 0 { | |
return nil, fmt.Errorf("%w: no pages", ErrParse) | |
} | |
tkz := newPDFTokenizer(pr.Page(1).V.Key("Contents")) | |
defer tkz.Close() | |
if tk, err := tkz.Token(); err == io.EOF { | |
return r, fmt.Errorf("%w: not a course grade distribution (no text on first page)", ErrParse) | |
} else if err != nil { | |
return r, fmt.Errorf("read first text token: %w", err) | |
} else if tk.Tj != "Course Grade Distribution" { | |
return r, fmt.Errorf("%w: not a course grade distribution (got %q as the first text token)", ErrParse, tk.Tj) | |
} | |
if tk, err := tkz.Token(); err != nil && err != io.EOF { | |
return r, fmt.Errorf("read as of date: %w", err) | |
} else if m := reAsOf.FindStringSubmatch(tk.Tj); m == nil { | |
return r, fmt.Errorf("%w: invalid as of date (got %q)", ErrParse, tk.Tj) | |
} else if d, err := time.ParseInLocation("2 January 2006", strings.TrimPrefix(m[1], "0"), time.UTC); err != nil { | |
return r, fmt.Errorf("%w: invalid as of date: %v", ErrParse, err) | |
} else { | |
r.AsOf = d | |
} | |
if tks, err := tkz.Tokens(2); err == io.ErrUnexpectedEOF { | |
return nil, fmt.Errorf("%w: incomplete student info", ErrParse) | |
} else if err != nil { | |
return nil, fmt.Errorf("read student info: %w", err) | |
} else if tks[0].TmY != tks[1].TmY { | |
return nil, fmt.Errorf("%w: expected student name/number to be on the same line, got %v %v", ErrParse, tks[0].TmY, tks[1].TmY) | |
} else if len(tks[0].Tj) == 0 { | |
return nil, fmt.Errorf("%w: invalid student name", ErrParse) | |
} else if len(tks[1].Tj) != 8 { | |
return nil, fmt.Errorf("%w: invalid student number", ErrParse) | |
} else if n, err := strconv.ParseUint(tks[1].Tj, 10, 64); err != nil { | |
return nil, fmt.Errorf("%w: invalid student number", ErrParse) | |
} else { | |
r.StudentName = tks[0].Tj | |
r.StudentNumber = int(n) | |
} | |
if tks, err := tkz.Tokens(2); err == io.ErrUnexpectedEOF { | |
return nil, fmt.Errorf("%w: incomplete institution", ErrParse) | |
} else if err != nil { | |
return nil, fmt.Errorf("read institution: %w", err) | |
} else if tks[0].Tj != "Institution:" { | |
return nil, fmt.Errorf("%w: invalid institution (0: %q)", ErrParse, tks[0].Tj) | |
} else if v := strings.TrimSpace(tks[1].Tj); v != "Queen's University" { | |
return nil, fmt.Errorf("%w: unknown institution (got %q)", ErrParse, tks[1].Tj) | |
} else { | |
r.Institution = v | |
} | |
if tks, err := tkz.Tokens(2); err == io.ErrUnexpectedEOF { | |
return nil, fmt.Errorf("%w: incomplete career", ErrParse) | |
} else if err != nil { | |
return nil, fmt.Errorf("read career: %w", err) | |
} else if tks[0].Tj != "Academic Career:" { | |
return nil, fmt.Errorf("%w: invalid career (0: %q)", ErrParse, tks[0].Tj) | |
} else if v := strings.TrimSpace(tks[1].Tj); len(v) == 0 { | |
return nil, fmt.Errorf("%w: unknown career (got %q)", ErrParse, tks[1].Tj) | |
} else { | |
r.Career = v | |
} | |
if tks, err := tkz.Tokens(2); err == io.ErrUnexpectedEOF { | |
return nil, fmt.Errorf("%w: incomplete term", ErrParse) | |
} else if err != nil { | |
return nil, fmt.Errorf("read term: %w", err) | |
} else if tks[0].Tj != "Term:" { | |
return nil, fmt.Errorf("%w: invalid term (0: %q)", ErrParse, tks[0].Tj) | |
} else if m := reTerm.FindStringSubmatch(tks[1].Tj); m == nil { | |
return nil, fmt.Errorf("%w: invalid term (got %q)", ErrParse, tks[1].Tj) | |
} else { | |
r.Term = m[1] | |
} | |
if tk, err := tkz.Token(); err != nil && err != io.EOF { | |
return r, fmt.Errorf("read minimum enrollment message: %w", err) | |
} else if v := strings.TrimSpace(tk.Tj); v != "* Data is not displayed for courses with fewer than 10 enrollments." { | |
return r, fmt.Errorf("%w: expected minimum enrollment message, got %q", ErrParse, v) | |
} | |
if tks, err := tkz.Tokens(3 + int(MaxGrade)); err == io.ErrUnexpectedEOF { | |
return nil, fmt.Errorf("%w: incomplete table header", ErrParse) | |
} else if err != nil { | |
return nil, fmt.Errorf("read table header: %w", err) | |
} else { | |
for i, tk := range tks { | |
if tk.TmY != tks[0].TmY { | |
return nil, fmt.Errorf("%w: got %d table column headers on the same line, expected more", ErrParse, i) | |
} | |
var x string | |
switch i { | |
case 0: | |
x = "Course" | |
case 1: | |
x = "Description" | |
case 2: | |
x = "Enrollment" | |
default: | |
x = Grade(i - 3).String() | |
} | |
if tk.Tj != x { | |
return nil, fmt.Errorf("%w: expected table header %d to be %q (got %q)", ErrParse, i+1, x, tk.Tj) | |
} | |
} | |
} | |
for ry := math.NaN(); ; { | |
var c Course | |
if tk, err := tkz.Token(); err == io.EOF { | |
break | |
} else if err != nil { | |
return r, fmt.Errorf("read next row course header part 1: %w", err) | |
} else if tk.TmY == ry { | |
return r, fmt.Errorf("%w: read next course header part 1: unexpected text %q on same line as previous row", ErrParse, tk.Tj) | |
} else if m := reCourse.FindStringSubmatch(tk.Tj); m == nil { | |
return r, fmt.Errorf("%w: invalid course header part 1 (got %q)", ErrParse, tk.Tj) | |
} else { | |
c.Name = m[1] | |
ry = tk.TmY | |
} | |
if tk, err := tkz.Token(); err == io.EOF { | |
return r, fmt.Errorf("%w: missing row course header part 2", ErrParse) | |
} else if err != nil { | |
return r, fmt.Errorf("read next row course header part 2: %w", err) | |
} else if tk.TmY != ry { | |
return r, fmt.Errorf("%w: read next course header part 2: expected y position %v, got %s", ErrParse, ry, tk) | |
} else if v := strings.TrimSpace(tk.Tj); len(v) == 0 { | |
return r, fmt.Errorf("%w: invalid course header part 2 (got %q)", ErrParse, v) | |
} else { | |
c.Name += " " + v | |
} | |
if tk, err := tkz.Token(); err == io.EOF { | |
return r, fmt.Errorf("%w: missing row desc header first value", ErrParse) | |
} else if err != nil { | |
return r, fmt.Errorf("read row desc header first value: %w", err) | |
} else if tk.TmY != ry { | |
return r, fmt.Errorf("%w: read next desc header first value: expected y position %v, got %s", ErrParse, ry, tk) | |
} else if v := strings.TrimSpace(tk.Tj); len(v) == 0 { | |
return r, fmt.Errorf("%w: invalid desc header first value (got %q)", ErrParse, v) | |
} else { | |
c.Description = v | |
} | |
for { | |
if tk, err := tkz.Token(); err == io.EOF { | |
return r, fmt.Errorf("%w: missing row enrollment header (after description %q)", ErrParse, c.Description) | |
} else if err != nil { | |
return r, fmt.Errorf("read next row desc header value or enrollment header: %w", err) | |
} else if tk.TmY == ry { | |
if v, err := strconv.ParseUint(tk.Tj, 10, 64); err != nil { | |
return r, fmt.Errorf("%w: invalid row enrollment header (got %q)", ErrParse, tk.Tj) | |
} else { | |
c.Enrollment = int(v) | |
break | |
} | |
} else { | |
if v := strings.TrimSpace(tk.Tj); len(v) != 0 { | |
c.Description += " " + v | |
} | |
} | |
} | |
for g := Grade(0); g < MaxGrade; g++ { | |
if tk, err := tkz.Token(); err != nil { | |
return r, fmt.Errorf("read distribution value for grade %s: %w", g, err) | |
} else if v, err := strconv.ParseUint(tk.Tj, 10, 64); err != nil { | |
return r, fmt.Errorf("%w: invalid distribution value for grade %s (got %q)", ErrParse, g, tk.Tj) | |
} else if v > 100 { | |
return r, fmt.Errorf("%w: distribution value out of range for grade %s (got %q)", ErrParse, g, tk.Tj) | |
} else { | |
c.Distribution[g] = int(v) | |
} | |
} | |
r.Course = append(r.Course, c) | |
} | |
if len(r.Course) == 0 { | |
return r, fmt.Errorf("%w: no courses found in report", ErrParse) | |
} | |
return r, nil | |
} | |
type pdfTokenizer struct { | |
err error | |
errCh chan error | |
token chan pdfToken | |
cancel chan struct{} | |
} | |
// pdfToken is the text of one Tj operation together with the translation set
// by the most recent Tm operation.
type pdfToken struct {
	Tj  string  // the text of the Tj operation
	TmX float64 // the last Tm X value
	TmY float64 // the last Tm Y value
}

// String formats the token as its quoted text followed by its position, e.g.
// "text" @ (x, y).
func (t pdfToken) String() string {
	const layout = "%q @ (%v, %v)"
	return fmt.Sprintf(layout, t.Tj, t.TmX, t.TmY)
}
func newPDFTokenizer(strm pdf.Value) *pdfTokenizer { | |
tkz := &pdfTokenizer{ | |
errCh: make(chan error, 1), | |
token: make(chan pdfToken), | |
cancel: make(chan struct{}), | |
} | |
go func() { | |
defer close(tkz.token) | |
defer func() { | |
if x := recover(); x != nil { | |
switch x := x.(type) { | |
case error: | |
tkz.errCh <- fmt.Errorf("pdf parse error: %w", x) | |
default: | |
tkz.errCh <- fmt.Errorf("pdf parse error: %v", x) | |
} | |
close(tkz.errCh) | |
} | |
}() | |
if strm.Kind() != pdf.Stream { | |
panic("contents not a stream") | |
} | |
var tk pdfToken | |
pdf.Interpret(strm, func(stk *pdf.Stack, op string) { | |
select { | |
case <-tkz.cancel: | |
panic("reader closed") | |
default: | |
} | |
switch op { | |
case "Tm": | |
if v := stk.Pop(); v.Kind() != pdf.Real && v.Kind() != pdf.Integer { | |
panic("Tm instruction vertical translation invalid or missing") | |
} else { | |
tk.TmY = v.Float64() | |
} | |
if v := stk.Pop(); v.Kind() != pdf.Real && v.Kind() != pdf.Integer { | |
panic("Tm instruction horizontal translation invalid or missing") | |
} else { | |
tk.TmX = v.Float64() | |
} | |
case "Tj": | |
if v := stk.Pop(); v.Kind() != pdf.String { | |
panic("Tj instruction text invalid or missing") | |
} else { | |
tk.Tj = v.Text() | |
} | |
} | |
for stk.Len() != 0 { | |
stk.Pop() | |
} | |
if op == "Tj" { | |
select { | |
case <-tkz.cancel: | |
panic("reader closed") | |
case tkz.token <- tk: | |
} | |
} | |
}) | |
}() | |
return tkz | |
} | |
func (t *pdfTokenizer) Token() (pdfToken, error) { | |
if t.err != nil { | |
return pdfToken{}, t.err | |
} | |
tk, ok := <-t.token | |
if !ok { | |
select { | |
case err := <-t.errCh: | |
t.err = err | |
default: | |
t.err = io.EOF | |
} | |
return pdfToken{}, t.err | |
} | |
return tk, nil | |
} | |
func (t *pdfTokenizer) Tokens(n int) ([]pdfToken, error) { | |
tks := make([]pdfToken, 0, n) | |
for i := 0; i < n; i++ { | |
tk, err := t.Token() | |
if err == io.EOF { | |
err = io.ErrUnexpectedEOF | |
} | |
if err != nil { | |
return tks, err | |
} | |
tks = append(tks, tk) | |
} | |
return tks, nil | |
} | |
func (t *pdfTokenizer) Close() { | |
close(t.cancel) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment