Skip to content

Instantly share code, notes, and snippets.

@mauricioszabo
Created March 24, 2011 19:34
Show Gist options
  • Save mauricioszabo/885690 to your computer and use it in GitHub Desktop.
Save mauricioszabo/885690 to your computer and use it in GitHub Desktop.
Parsing de disciplinas da matrícula da UFABC. Requer a gem pdf-reader
require 'rubygems'
require 'pdf/reader'
require "re_parser"
require "yaml"
# Callback receiver handed to the PDF reader: every positioned-text run it is
# given is joined into a single string and fed to a ReParser.
class PageTextReceiver
  # The ReParser instance that accumulates the parsed records.
  attr_accessor :disciplinas

  def initialize
    @disciplinas = ReParser.new
  end

  # Called when page parsing starts; nothing to do per page.
  def begin_page(arg = nil)
  end

  # Plain text callbacks are deliberately ignored; only positioned text
  # (see #show_text_with_positioning) is forwarded to the parser.
  def show_text(string, *params)
  end

  # There are a few text callbacks, so make sure all of them are handled.
  alias :super_show_text :show_text
  alias :move_to_next_line_and_show_text :show_text
  alias :set_spacing_next_line_show_text :show_text

  # This final text callback takes slightly different arguments: +params+ is
  # an array mixing text fragments with numeric positioning adjustments.
  #
  # Fragments are selected by type rather than by even index, because the
  # array is not guaranteed to alternate string/number: a number at an even
  # position would otherwise be appended as a character code (or raise), and
  # a string at an odd position would be dropped.
  def show_text_with_positioning(params, *other)
    str = params.grep(String).join
    @disciplinas.parse str
    # Progress/debug output: the record currently being filled in.
    p @disciplinas.disciplinas.last
  end
end
# Run the PDF through the receiver, then dump every parsed record to YAML as
# a plain hash of struct member => value.
receiver = PageTextReceiver.new
pdf = PDF::Reader.file("disciplinas_ofertadas_2011_2", receiver)

File.open("disciplinas.yml", "w") do |saida|
  registros = receiver.disciplinas.disciplinas.map do |registro|
    # members and values are parallel arrays, so zipping yields [key, value] pairs.
    Hash[registro.members.zip(registro.values)]
  end
  saida.print(registros.to_yaml)
end
# One course record. Fields are filled positionally by ReParser as text
# chunks arrive; +horarios+ is an Array of strings, every other field is a
# String (or nil while not yet seen).
Disciplina = Struct.new :codigo, :nome, :turma, :turno, :t, :p, :i, :horarios, :campus, :cat_bct, :cat_bch

# Incremental, position-based parser. Chunks of text are passed one at a time
# to #parse; a chunk that looks like a course code starts a new Disciplina and
# the chunks that follow are assigned to its fields according to a running
# counter.
class ReParser
  # Array of Disciplina records parsed so far, in input order.
  attr_reader :disciplinas

  def initialize
    @disciplinas = []
    # Position of the current chunk within the record being filled.
    @cont = 0
  end

  # Feeds one chunk of text to the parser.
  #
  # A chunk made of two word characters followed by four digits starts a new
  # record. Any other chunk is treated as the next field of the most recent
  # record; chunks seen before the first code are ignored.
  #
  # The argument is never modified (the original called +string.chop+ here and
  # discarded the result, which had no effect; that dead call is gone).
  def parse(string)
    if string =~ /^\w{2}\d{4}$/
      @cont = 0
      @disciplinas << Disciplina.new(string)
    elsif @disciplinas.last
      @cont += 1
      parse_other(string, @disciplinas.last)
    end
  end

  # Assigns +string+ to the field of +disciplina+ selected by the current
  # counter. Fields that may be extended by a later chunk (+nome+, +cat_bct+,
  # +cat_bch+) are stored as copies, so appending to them never mutates the
  # caller's string and frozen input is accepted.
  def parse_other(string, disciplina)
    case @cont
    when 1
      disciplina.nome = string.dup
    when 2
      if string !~ /^\w\d?$/
        # Not a class identifier yet: the name continues in this chunk, so
        # stay on the same position for the next one.
        @cont -= 1
        disciplina.nome << string
      else
        disciplina.turma = string
      end
    when 3 then disciplina.turno = string
    when 4 then disciplina.t = string
    when 5 then disciplina.p = string
    when 6 then disciplina.i = string
    when 7
      disciplina.horarios ||= []
      disciplina.horarios << string.gsub(/,\s*/, '')
    when 8
      if ["SA", "SB"].include?(string)
        disciplina.campus = string
      else
        # Still a schedule entry: step back and handle it as position 7.
        @cont -= 1
        parse_other(string, disciplina)
      end
    when 9
      # The chunk at this position is intentionally not stored.
    when 10
      disciplina.cat_bct = string.dup
    when 11
      if string =~ /limitada/
        # Continuation of the previous category; the next chunk is then
        # examined at this same position again.
        disciplina.cat_bct << string
        @cont -= 1
      else
        disciplina.cat_bch = string.dup
      end
    when 12
      disciplina.cat_bch << string if string =~ /li/
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment