Created
March 24, 2011 19:34
-
-
Save mauricioszabo/885690 to your computer and use it in GitHub Desktop.
Parsing de disciplinas da matrícula da UFABC. Requer a gem pdf-reader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'pdf/reader' | |
require "re_parser" | |
require "yaml" | |
# Receiver object handed to PDF::Reader: it collects the text fragments drawn
# with positioning information and forwards each one to a ReParser, which
# turns the stream of fragments into records.
class PageTextReceiver
  # The ReParser instance that accumulates the parsed records.
  attr_accessor :disciplinas

  def initialize
    @disciplinas = ReParser.new
  end

  # Called when page parsing starts. Nothing needs to happen per page.
  def begin_page(arg = nil)
  end

  # Plain text-drawing callback. Intentionally a no-op: only text delivered
  # through show_text_with_positioning is fed to the parser.
  def show_text(string, *params)
  end

  # There are a few equivalent text callbacks; route them all to the same no-op.
  alias :super_show_text :show_text
  alias :move_to_next_line_and_show_text :show_text
  alias :set_spacing_next_line_show_text :show_text

  # Text callback whose first argument is an array mixing text strings with
  # numeric positioning adjustments.
  #
  # Every String element is concatenated, in order, into one fragment that is
  # passed to the parser. Numeric elements are skipped regardless of where
  # they sit in the array, so an array that starts with a number (or has two
  # numbers in a row) no longer corrupts or drops text.
  #
  # params - Array of Strings and Numerics.
  # other  - extra callback arguments, ignored.
  def show_text_with_positioning(params, *other)
    str = params.grep(String).join
    @disciplinas.parse str
    # Progress/debug output: dump the record currently being filled.
    p @disciplinas.disciplinas.last
  end
end
# Script entry point: run the PDF through the receiver, then dump every
# parsed record to disciplinas.yml as a list of plain hashes.
receiver = PageTextReceiver.new

# NOTE(review): PDF::Reader.file(path, receiver) looks like the pre-1.0
# pdf-reader API; confirm the installed gem version still provides it.
PDF::Reader.file("disciplinas_ofertadas_2011_2", receiver)

File.open("disciplinas.yml", "w") do |f|
  # Convert each Struct record into a member => value hash so the YAML
  # contains plain mappings rather than Struct-tagged objects.
  registros = receiver.disciplinas.disciplinas.map { |b| Hash[b.members.zip(b.values)] }
  f.print registros.to_yaml
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One record produced by ReParser. Fields are filled in positionally, in the
# order the text fragments arrive:
#   codigo   - the code fragment that opened the record
#   nome     - name (may be assembled from several fragments)
#   turma, turno, t, p, i - one fragment each, stored verbatim
#   horarios - Array of fragments with ", " separators removed
#   campus   - "SA" or "SB"
#   cat_bct, cat_bch - category fragments, stored verbatim
Disciplina = Struct.new(
  :codigo, :nome, :turma, :turno, :t, :p, :i,
  :horarios, :campus, :cat_bct, :cat_bch
)
# Stateful parser that turns a stream of text fragments into Disciplina
# records. A fragment that looks like a code (two word characters followed by
# four digits) opens a new record; each following fragment is assigned to the
# next field of that record according to its position.
#
# The parser never mutates the strings it is given, so frozen strings are
# accepted and callers keep ownership of their input.
class ReParser
  # Campus codes that terminate the list of schedule fragments.
  CAMPI = %w[SA SB].freeze

  # Array of Disciplina records built so far, in the order they were opened.
  attr_reader :disciplinas

  def initialize
    @disciplinas = []
    # Position of the current fragment inside the record being filled
    # (0 is the code itself, 1 the name, 2 the class, ...).
    @cont = 0
  end

  # Feeds one fragment to the parser.
  #
  # string - the text fragment. A code fragment starts a new record; anything
  #          else is routed to parse_other for the record currently open.
  #          Fragments seen before the first code are ignored.
  def parse(string)
    if string =~ /^\w{2}\d{4}$/
      @cont = 0
      @disciplinas << Disciplina.new(string)
    elsif @disciplinas.last
      @cont += 1
      parse_other(string, @disciplinas.last)
    end
  end

  # Stores a non-code fragment in the field selected by the current position.
  #
  # string     - the text fragment.
  # disciplina - the Disciplina record to update.
  #
  # Some fields can span several fragments; in those cases the position
  # counter is stepped back so the next fragment is tried at the same slot.
  def parse_other(string, disciplina)
    case @cont
    when 1
      disciplina.nome = string
    when 2
      if string !~ /^\w\d?$/
        # Not a class label (one word character plus an optional digit):
        # treat it as a continuation of the name.
        @cont -= 1
        disciplina.nome = "#{disciplina.nome}#{string}"
      else
        disciplina.turma = string
      end
    when 3 then disciplina.turno = string
    when 4 then disciplina.t = string
    when 5 then disciplina.p = string
    when 6 then disciplina.i = string
    when 7
      disciplina.horarios ||= []
      disciplina.horarios << string.gsub(/,\s*/, '')
    when 8
      if CAMPI.include?(string)
        disciplina.campus = string
      else
        # Still inside the schedule list: reprocess as another schedule entry.
        @cont -= 1
        parse_other(string, disciplina)
      end
    when 9
      # The fragment at this position is deliberately discarded.
    when 10
      disciplina.cat_bct = string
    when 11
      if string =~ /limitada/
        # Continuation of the BCT category; stay on this slot for the next fragment.
        disciplina.cat_bct = "#{disciplina.cat_bct}#{string}"
        @cont -= 1
      else
        disciplina.cat_bch = string
      end
    when 12
      disciplina.cat_bch = "#{disciplina.cat_bch}#{string}" if string =~ /li/
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment