Created
April 1, 2009 07:58
-
-
Save metade/88607 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
data/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'treetop' | |
require 'pp' | |
Treetop.load(File.join(File.dirname(__FILE__), "lonclass.treetop")) | |
# Parses Lonclass listing text with the Treetop-generated
# LonclassGrammarParser and converts the syntax tree into plain hashes.
class LonclassParser
  # Convenience wrapper: builds a fresh parser and parses +string+ with it.
  # Returns whatever #parse returns.
  def self.parse(string)
    new.parse(string)
  end

  def initialize
    @grammar = LonclassGrammarParser.new
  end

  # Parses +string+ (one or more complete lines).
  #
  # On success returns an Array with one Hash per line, keyed by :number,
  # :company, :subjects and :message. When the grammar rejects the input,
  # the grammar's terminal failures are written to $stderr and an empty
  # Hash is returned; callers compare against {} to detect failure.
  def parse(string)
    doc = @grammar.parse(string)
    if doc.nil?
      report_failures(string)
      return {}
    end

    doc.elements.map do |line|
      {
        :number   => line.number.text_value,
        # The optional company only exposes `name` when it actually matched.
        :company  => (line.company.respond_to?(:name) ? line.company.name.text_value : nil),
        :subjects => line.subjects.elements.map { |subject| subject_details(subject) },
        :message  => line.message.text_value
      }
    end
  end

  protected

  # Writes each terminal failure recorded by the grammar to $stderr together
  # with the ten characters of +string+ starting at the failure index.
  def report_failures(string)
    @grammar.terminal_failures.each do |tf|
      $stderr.puts "Expected #{tf.expected_string.inspect} (#{tf.index})- #{string[tf.index,10].inspect}"
    end
  end

  # Builds the Hash describing a single subject node.
  def subject_details(subject)
    number = subject.respond_to?(:number) ? subject.number.text_value : nil
    name, date, location, company = extract_extensions(subject)
    {
      :number             => number,
      :name_extension     => name,
      :date_extension     => date,
      :location_extension => location,
      :company_extension  => company,
    }
  end

  # Returns [name, date, location, company] for +subject+; each entry is nil
  # when the subject carries no extension of that kind.
  #
  # A subject node either exposes `extensions` (and an optional `company`),
  # or is itself a single extension node.
  def extract_extensions(subject)
    name = date = location = company = nil

    if subject.respond_to?(:extensions)
      # `empty?` is core Ruby; the previous `blank?` only exists when some
      # library has monkey-patched String.
      company = subject.company.text_value.empty? ? nil : subject.company.name.text_value
      extensions = subject.extensions.elements
    else
      extensions = [subject]
    end

    extensions.each do |extension|
      if extension.respond_to?(:date)
        date = extension.date.text_value
      elsif extension.respond_to?(:location)
        location = extension.location.text_value
      else
        # Neither date nor location: the node's own text is taken as the name.
        name = extension.text_value
      end
    end

    [name, date, location, company]
  end
end
# Script entry point: parses the converted Lonclass dump one line at a time,
# echoing the line index, the raw line and the parsed details, and stops at
# the first line the grammar rejects (LonclassParser.parse returns {}).
if __FILE__ == $0
  # File.foreach yields each line with its terminator (the grammar's `eol`
  # needs it) and closes the file when iteration ends. The previous
  # String#each_with_index call does not exist on Ruby 1.9+, and the handle
  # returned by File.open was never closed.
  File.foreach('data/converted-LONCLASS_con.txt').each_with_index do |line, i|
    puts i
    puts line
    details = LonclassParser.parse(line)
    break if details == {}
    pp details
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
grammar LonclassGrammar
  # A document is one or more lines.
  rule lonclass
    lines
  end

  rule lines
    line+
  end

  # <line number>|[company]subjects<spaces>|<message><eol>
  # The leading id is labelled `number` so the node keeps exposing a
  # `number` accessor even though its rule is named `line_number`.
  rule line
    number:line_number '|' company:company? subjects:subject* ' '+ '|' message eol
  end

  # Plain digits identifying the line. This rule used to be called `number`
  # as well, so it was silently replaced by the subject-number rule further
  # down and line ids were matched with the subject-number syntax instead.
  rule line_number
    [0-9]+
  end

  # A bracketed name, e.g. [A AND W].
  rule company
    '[' name:char+ ']'
  end

  # Either a bare extension, or a subject number followed by any number of
  # extensions, an optional ':' separator and an optional company.
  rule subject
    extension / (number:number+ extensions:extension* ':'? company:company?)
  end

  # Ordered choice: location is tried first, then date, then name.
  rule extension
    location_extension / date_extension / name_extension
  end

  # Subject number: one or more classes such as .007.004.761 or 3-058.1.
  rule number
    subject_class+
  end

  # Optional run of '.', '-' or '/' separators followed by digits.
  rule subject_class
    ('.' / '-' / '/')* [0-9]+
  end

  # Double-quoted run of digits, dots and spaces, e.g. "1991".
  rule date_extension
    '"' date:[. 0-9]+ '"'
  end

  # Parenthesised subject, e.g. (429) or (421/425).
  rule location_extension
    '(' location:subject ')'
  end

  # Starts with an upper-case letter; a space is only consumed when another
  # name character follows it, so the padding before '|' is left alone.
  rule name_extension
    ([A-Z] ([,A-Z] / ' ' [,0-9A-Z])+)
  end

  rule message
    char+
  end

  # Note: `)-/` is a character range, so besides ')' and '/' this class also
  # admits '*', '+', ',', '-' and '.'.
  rule char
    ['A-Z0-9& ()-/]
  end

  # CR, CRLF or LF.
  rule eol
    ("\r" "\n"?) / "\n"
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env spec | |
# via "sudo gem install rspec" | |
require 'lonclass' | |
# Examples for LonclassParser.parse: each feeds one raw line to the parser
# and checks the resulting array of line hashes.
describe "Lonclass parser" do
  it "should parse a line with a blank company name" do
    str = "80276|[ ] |COMPANIES\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[],
      :message=>"COMPANIES",
      :number=>"80276",
      :company=>" "
    }]
  end

  it "should parse a line with a company name" do
    str = "80277|[A AND W] |A AND W (FAST FOOD COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[],
      :message=>"A AND W (FAST FOOD COMPANY)",
      :number=>"80277",
      :company=>"A AND W"
    }]
  end

  it "should parse a line that has a number in the company name" do
    str = "80342|[AIRE O2] |AIRE O2 (COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[],
      :message=>"AIRE O2 (COMPANY)",
      :number=>"80342",
      :company=>"AIRE O2"
    }]
  end

  it "should parse a line with a company name and a quote in the message" do
    str = "80279|[A J BRETT] |A' J BRETT (ANTIQUE RESTORERS)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[],
      :message=>"A' J BRETT (ANTIQUE RESTORERS)",
      :number=>"80279",
      :company=>"A J BRETT"
    }]
  end

  it "should parse a line with a company name and a subject number" do
    str = "80285|[ABB TRANSPORTATION].007.004.761 |JOB LOSSES AT ABB TRANSPORTATION\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        # The :company_extension key used to be listed twice in this literal.
        {:number=>".007.004.761", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil}
      ],
      :message=>"JOB LOSSES AT ABB TRANSPORTATION",
      :number=>"80285",
      :company=>"ABB TRANSPORTATION"
    }]
  end

  it "should parse a line with a company name and a subject number with a name extension" do
    str = "80309|[AD DAF].008.02LEYLAND DAF |LLEYLAND DAF (COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>".008.02", :name_extension=>"LEYLAND DAF", :location_extension=>nil, :date_extension=>nil, :company_extension=>nil}
      ],
      :message=>"LLEYLAND DAF (COMPANY)",
      :number=>"80309",
      :company=>"AD DAF"
    }]
  end

  it "should parse a line with a company name and a conjoined subject number" do
    str = "80720|[BEDFORD].008.24TRUCK.004.6 |DECLINE OF BEDFORD TRUCKS (COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>".008.24", :name_extension=>"TRUCK", :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>'.004.6', :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"DECLINE OF BEDFORD TRUCKS (COMPANY)",
      :number=>"80720",
      :company=>"BEDFORD"
    }]
  end

  it "should parse a line with a company name and 2 related subject numbers" do
    str = "80586|[BARLOW CLOWES]658.111:332.6 |BARLOW CLOWES INVESTMENT GROUP\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"658.111", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>'332.6', :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"BARLOW CLOWES INVESTMENT GROUP",
      :number=>"80586",
      :company=>"BARLOW CLOWES"
    }]
  end

  # Description made unique: it used to repeat the previous example's text.
  it "should parse a line with a company name and 4 related subject numbers, the last with a location extension" do
    str = "80316|[ADIDAS]301.161.1:3-058.1:796.007.009.031:796.334.1.007(429) |ADIDAS BOOT MONEY SCANDAL (PAYMENTS TO AMATEUR WELSH RUGBY PLAYERS)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"301.161.1", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>'3-058.1', :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>'796.007.009.031', :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>'796.334.1.007', :name_extension=>nil, :location_extension=>"429", :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"ADIDAS BOOT MONEY SCANDAL (PAYMENTS TO AMATEUR WELSH RUGBY PLAYERS)",
      :number=>"80316",
      :company=>"ADIDAS"
    }]
  end

  it "should parse a line with a company name and subject number with a date" do
    str = "80404|[AMERICAN EXPRESS].093\"1991\" |AMERICAN EXPRESS BANK AWARD 1991\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>".093", :name_extension=>nil, :location_extension=>nil, :date_extension=>"1991", :company_extension=>nil},
      ],
      :message=>"AMERICAN EXPRESS BANK AWARD 1991",
      :number=>"80404",
      :company=>"AMERICAN EXPRESS"
    }]
  end

  # Description corrected: this input has neither a company nor a date.
  it "should parse a line with a subject number and no company name" do
    str = "270789|621.452.002.793.2 |JET ENGINE MUFFLERS\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"621.452.002.793.2", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"JET ENGINE MUFFLERS",
      :number=>"270789",
      :company=>nil
    }]
  end

  it "should parse a line with a slash in the location extension" do
    str = "270790|321.61(421/425)JAMES II |JAMES II (BRITISH KING)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"321.61", :name_extension=>"JAMES II", :location_extension=>"421/425", :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"JAMES II (BRITISH KING)",
      :number=>"270790",
      :company=>nil
    }]
  end

  it "should parse a line with a relation after a name extension" do
    str = "80670|[BBC]654.192.77SUB:654.19:629.195 |BBC SUBSCRIPTION CHANNEL\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"654.192.77", :name_extension=>"SUB", :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>"654.19", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>"629.195", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"BBC SUBSCRIPTION CHANNEL",
      :number=>"80670",
      :company=>"BBC"
    }]
  end

  it "should parse a line with a location extension after a name extension" do
    str = "80681|[BBC]654.193ENGLISH(73) |BBC WORLD SERVICE RADIO BROADCAST TO USA\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"654.193", :name_extension=>"ENGLISH", :location_extension=>"73", :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"BBC WORLD SERVICE RADIO BROADCAST TO USA",
      :number=>"80681",
      :company=>"BBC"
    }]
  end

  it "should parse a line with multiple subject numbers" do
    str = "80809|[BNOC]338.532.31:665.4/.5(261.2) |BNOC CUT PRICE OF NORTH SEA OIL\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"338.532.31", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>"665.4/.5", :name_extension=>nil, :location_extension=>'261.2', :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"BNOC CUT PRICE OF NORTH SEA OIL",
      :number=>"80809",
      :company=>"BNOC"
    }]
  end

  it "should parse a line with a company number followed by a location" do
    str = "80862|[BRENT WALKER](047.1) |BRENT WALKER ANNUAL REPORTS\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>nil, :name_extension=>nil, :location_extension=>'047.1', :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"BRENT WALKER ANNUAL REPORTS",
      :number=>"80862",
      :company=>"BRENT WALKER"
    }]
  end

  it "should parse a line with a company number followed by another company number" do
    str = "80923|[BRITISH LEYLAND].008.01[JAGUAR] |JAGUAR CARS LTD (BL SUB GROUP)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>'.008.01', :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension => 'JAGUAR'},
      ],
      :message=>"JAGUAR CARS LTD (BL SUB GROUP)",
      :number=>"80923",
      :company=>"BRITISH LEYLAND"
    }]
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment