public
Last active

  • Download Gist
.gitignore
1
data/
lonclass.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
#!/usr/bin/env ruby
 
require 'rubygems'
require 'treetop'
require 'pp'
 
Treetop.load(File.join(File.dirname(__FILE__), "lonclass.treetop"))
 
# Parses LONCLASS index records using the Treetop-generated
# LonclassGrammarParser and converts the resulting syntax tree into plain
# Ruby hashes.
class LonclassParser
  # Convenience wrapper: builds a parser and parses +string+ with it.
  #
  # @param string [String] one or more newline-terminated records
  # @return [Array<Hash>, Hash] see #parse
  def self.parse(string)
    new.parse(string)
  end

  def initialize
    @grammar = LonclassGrammarParser.new
  end

  # Parses +string+ and returns one hash per record.
  #
  # Each hash has the keys :number, :company, :subjects and :message, where
  # :subjects is an array of hashes with the keys :number, :name_extension,
  # :date_extension, :location_extension and :company_extension.
  #
  # When the grammar rejects the input, the terminal failures are written to
  # $stderr and an empty Hash is returned (callers compare against {}).
  #
  # @param string [String] one or more newline-terminated records
  # @return [Array<Hash>, Hash] parsed records, or {} on a parse failure
  def parse(string)
    doc = @grammar.parse(string)
    if doc.nil?
      report_failures(string)
      return {}
    end
    doc.elements.map { |line| line_details(line) }
  end

  protected

  # Pulls the name, date, location and company extensions out of a subject
  # node.
  #
  # A subject is either a numbered subject (it responds to +extensions+ and
  # +company+) or a bare extension node, which is then treated as its own
  # single extension.
  #
  # @param subject [#text_value] a subject syntax node
  # @return [Array] [name, date, location, company]; each entry is a String
  #   or nil when that extension is absent
  def extract_extensions(subject)
    name = date = location = company = nil
    if subject.respond_to?(:extensions)
      company_node = subject.company
      # Plain Ruby has no String#blank? (that is ActiveSupport), so test for
      # an empty/whitespace-only match explicitly.
      company = company_node.name.text_value unless company_node.text_value.strip.empty?
      extensions = subject.extensions.elements
    else
      extensions = [subject]
    end
    extensions.each do |extension|
      if extension.respond_to?(:date)
        date = extension.date.text_value
      elsif extension.respond_to?(:location)
        location = extension.location.text_value
      else
        name = extension.text_value
      end
    end
    [name, date, location, company]
  end

  private

  # Writes every terminal failure recorded by the grammar to $stderr, together
  # with the ten characters of input found at the failing index.
  def report_failures(string)
    @grammar.terminal_failures.each do |tf|
      $stderr.puts "Expected #{tf.expected_string.inspect} (#{tf.index})- #{string[tf.index,10].inspect}"
    end
  end

  # Converts one parsed line node into its result hash.
  def line_details(line)
    {
      :number => line.number.text_value,
      :company => (line.company.respond_to?(:name) ? line.company.name.text_value : nil),
      :subjects => line.subjects.elements.map { |subject| subject_details(subject) },
      :message => line.message.text_value
    }
  end

  # Converts one subject node into its result hash. Subjects that consist of a
  # bare extension do not respond to +number+ and get a nil :number.
  def subject_details(subject)
    number = subject.respond_to?(:number) ? subject.number.text_value : nil
    name, date, location, company = extract_extensions(subject)
    {
      :number => number,
      :name_extension => name,
      :date_extension => date,
      :location_extension => location,
      :company_extension => company
    }
  end
end
 
# Script entry point: parse the converted LONCLASS data file one line at a
# time, echoing each line index and line, and stop at the first line the
# grammar rejects (LonclassParser.parse returns {} in that case).
if __FILE__ == $PROGRAM_NAME
  # File.foreach closes the file when the block exits (including via break).
  # Each yielded line keeps its line terminator, which the grammar's eol rule
  # requires. String#each_with_index is not available on Ruby 1.9+, so the
  # file is iterated directly instead of being read into one String first.
  File.foreach('data/converted-LONCLASS_con.txt').with_index do |line, i|
    puts i
    puts line
    details = LonclassParser.parse(line)
    break if details == {}
    pp details
  end
end
lonclass.treetop
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
# Grammar for LONCLASS index records. Each record is one line of the form
#   <record number>|[COMPANY]<subjects> |<message><eol>
# where the company and the subjects are optional.
grammar LonclassGrammar
  rule lonclass
    lines
  end

  rule lines
    line+
  end

  # The record number is labelled "number" so the accessor used by the Ruby
  # code (line.number) is kept while the rule itself has a distinct name.
  rule line
    number:record_number '|' company:company? subjects:subject* ' '+ '|' message eol
  end

  # Leading record identifier. This rule was originally also called "number",
  # clashing with the subject-number rule defined further down.
  rule record_number
    [0-9]+
  end

  rule company
    '[' name:char+ ']'
  end

  # Either a bare extension, or a subject number followed by any extensions,
  # an optional ':' relation separator and an optional company in brackets.
  rule subject
    extension / (number:number+ extensions:extension* ':'? company:company?)
  end

  rule extension
    location_extension / date_extension / name_extension
  end

  # Subject classification number, e.g. ".007.004.761" or "665.4/.5".
  rule number
    subject_class+
  end

  rule subject_class
    ('.' / '-' / '/')* [0-9]+
  end

  rule date_extension
    '"' date:[. 0-9]+ '"'
  end

  rule location_extension
    '(' location:subject ')'
  end

  rule name_extension
    ([A-Z] ([,A-Z] / ' ' [,0-9A-Z])+)
  end

  rule message
    char+
  end

  # NOTE(review): as written, ")-/" inside the class is a character range
  # (')' through '/'), so it also admits '*', '+', ',', '-' and '.'. Left
  # unchanged because existing data may rely on it - confirm the intent.
  rule char
    ['A-Z0-9& ()-/]
  end

  rule eol
    ("\r" "\n"?) / "\n"
  end
end
lonclass_spec.rb
Ruby

#!/usr/bin/env spec
# via "sudo gem install rspec"
 
require 'lonclass'
 
# Examples for LonclassParser.parse: each feeds one newline-terminated record
# to the parser and compares the returned array of hashes.
describe "Lonclass parser" do
  # Builds the expected hash for one parsed subject: every extension defaults
  # to nil and +extensions+ overrides the ones present in the record.
  def expected_subject(number, extensions = {})
    {
      :number => number,
      :name_extension => nil,
      :date_extension => nil,
      :location_extension => nil,
      :company_extension => nil
    }.merge(extensions)
  end

  it "should parse a line with a blank company name" do
    str = "80276|[ ] |COMPANIES\n"
    LonclassParser.parse(str).should == [{
      :subjects => [],
      :message => "COMPANIES",
      :number => "80276",
      :company => " "
    }]
  end

  it "should parse a line with a company name" do
    str = "80277|[A AND W] |A AND W (FAST FOOD COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects => [],
      :message => "A AND W (FAST FOOD COMPANY)",
      :number => "80277",
      :company => "A AND W"
    }]
  end

  it "should parse a line that has a number in the company name" do
    str = "80342|[AIRE O2] |AIRE O2 (COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects => [],
      :message => "AIRE O2 (COMPANY)",
      :number => "80342",
      :company => "AIRE O2"
    }]
  end

  it "should parse a line with a company name and a quote in the message" do
    str = "80279|[A J BRETT] |A' J BRETT (ANTIQUE RESTORERS)\n"
    LonclassParser.parse(str).should == [{
      :subjects => [],
      :message => "A' J BRETT (ANTIQUE RESTORERS)",
      :number => "80279",
      :company => "A J BRETT"
    }]
  end

  it "should parse a line with a company name and a subject number" do
    str = "80285|[ABB TRANSPORTATION].007.004.761 |JOB LOSSES AT ABB TRANSPORTATION\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject(".007.004.761")
      ],
      :message => "JOB LOSSES AT ABB TRANSPORTATION",
      :number => "80285",
      :company => "ABB TRANSPORTATION"
    }]
  end

  it "should parse a line with a company name and a subject number with a name extension" do
    str = "80309|[AD DAF].008.02LEYLAND DAF |LLEYLAND DAF (COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject(".008.02", :name_extension => "LEYLAND DAF")
      ],
      :message => "LLEYLAND DAF (COMPANY)",
      :number => "80309",
      :company => "AD DAF"
    }]
  end

  it "should parse a line with a company name and a conjoined subject number" do
    str = "80720|[BEDFORD].008.24TRUCK.004.6 |DECLINE OF BEDFORD TRUCKS (COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject(".008.24", :name_extension => "TRUCK"),
        expected_subject(".004.6")
      ],
      :message => "DECLINE OF BEDFORD TRUCKS (COMPANY)",
      :number => "80720",
      :company => "BEDFORD"
    }]
  end

  it "should parse a line with a company name and 2 related subject numbers" do
    str = "80586|[BARLOW CLOWES]658.111:332.6 |BARLOW CLOWES INVESTMENT GROUP\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject("658.111"),
        expected_subject("332.6")
      ],
      :message => "BARLOW CLOWES INVESTMENT GROUP",
      :number => "80586",
      :company => "BARLOW CLOWES"
    }]
  end

  it "should parse a line with a company name and several related subject numbers ending in a location" do
    str = "80316|[ADIDAS]301.161.1:3-058.1:796.007.009.031:796.334.1.007(429) |ADIDAS BOOT MONEY SCANDAL (PAYMENTS TO AMATEUR WELSH RUGBY PLAYERS)\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject("301.161.1"),
        expected_subject("3-058.1"),
        expected_subject("796.007.009.031"),
        expected_subject("796.334.1.007", :location_extension => "429")
      ],
      :message => "ADIDAS BOOT MONEY SCANDAL (PAYMENTS TO AMATEUR WELSH RUGBY PLAYERS)",
      :number => "80316",
      :company => "ADIDAS"
    }]
  end

  it "should parse a line with a company name and subject number with a date" do
    str = "80404|[AMERICAN EXPRESS].093\"1991\" |AMERICAN EXPRESS BANK AWARD 1991\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject(".093", :date_extension => "1991")
      ],
      :message => "AMERICAN EXPRESS BANK AWARD 1991",
      :number => "80404",
      :company => "AMERICAN EXPRESS"
    }]
  end

  it "should parse a line with a subject number and no company name" do
    str = "270789|621.452.002.793.2 |JET ENGINE MUFFLERS\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject("621.452.002.793.2")
      ],
      :message => "JET ENGINE MUFFLERS",
      :number => "270789",
      :company => nil
    }]
  end

  it "should parse a line with a slash in the location extension" do
    str = "270790|321.61(421/425)JAMES II |JAMES II (BRITISH KING)\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject("321.61", :name_extension => "JAMES II", :location_extension => "421/425")
      ],
      :message => "JAMES II (BRITISH KING)",
      :number => "270790",
      :company => nil
    }]
  end

  it "should parse a line with a relation after a name extension" do
    str = "80670|[BBC]654.192.77SUB:654.19:629.195 |BBC SUBSCRIPTION CHANNEL\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject("654.192.77", :name_extension => "SUB"),
        expected_subject("654.19"),
        expected_subject("629.195")
      ],
      :message => "BBC SUBSCRIPTION CHANNEL",
      :number => "80670",
      :company => "BBC"
    }]
  end

  it "should parse a line with a location extension after a name extension" do
    str = "80681|[BBC]654.193ENGLISH(73) |BBC WORLD SERVICE RADIO BROADCAST TO USA\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject("654.193", :name_extension => "ENGLISH", :location_extension => "73")
      ],
      :message => "BBC WORLD SERVICE RADIO BROADCAST TO USA",
      :number => "80681",
      :company => "BBC"
    }]
  end

  it "should parse a line with multiple subject numbers" do
    str = "80809|[BNOC]338.532.31:665.4/.5(261.2) |BNOC CUT PRICE OF NORTH SEA OIL\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject("338.532.31"),
        expected_subject("665.4/.5", :location_extension => "261.2")
      ],
      :message => "BNOC CUT PRICE OF NORTH SEA OIL",
      :number => "80809",
      :company => "BNOC"
    }]
  end

  it "should parse a line with a company number followed by a location" do
    str = "80862|[BRENT WALKER](047.1) |BRENT WALKER ANNUAL REPORTS\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject(nil, :location_extension => "047.1")
      ],
      :message => "BRENT WALKER ANNUAL REPORTS",
      :number => "80862",
      :company => "BRENT WALKER"
    }]
  end

  it "should parse a line with a company number followed by another company number" do
    str = "80923|[BRITISH LEYLAND].008.01[JAGUAR] |JAGUAR CARS LTD (BL SUB GROUP)\n"
    LonclassParser.parse(str).should == [{
      :subjects => [
        expected_subject(".008.01", :company_extension => "JAGUAR")
      ],
      :message => "JAGUAR CARS LTD (BL SUB GROUP)",
      :number => "80923",
      :company => "BRITISH LEYLAND"
    }]
  end
end

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.