public
Last active

  • Download Gist
.gitignore
1
data/
lonclass.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
#!/usr/bin/env ruby
 
require 'rubygems'
require 'treetop'
require 'pp'
 
# Load the Treetop grammar file that sits next to this script. The grammar is
# named LonclassGrammar, and the parser class used below is
# LonclassGrammarParser (Treetop's "<GrammarName>Parser" naming convention).
Treetop.load(File.join(File.dirname(__FILE__), "lonclass.treetop"))
 
# Thin wrapper around the Treetop-generated LonclassGrammarParser that turns a
# parsed document into plain Ruby hashes, one per input line.
class LonclassParser
  # Convenience entry point: builds a parser and parses +string+ with it.
  # See #parse for the return value.
  def self.parse(string)
    new.parse(string)
  end

  def initialize
    @grammar = LonclassGrammarParser.new
  end

  # Parses +string+ (one or more newline-terminated lines).
  #
  # Returns an Array with one Hash per line, keyed by :number, :company,
  # :subjects and :message. :company is nil when the line has no company
  # node; :subjects is an Array of hashes keyed by :number, :name_extension,
  # :date_extension, :location_extension and :company_extension.
  #
  # When the grammar rejects the input, every terminal failure is written to
  # $stderr and an empty Hash ({}) is returned. Callers compare against {}
  # to detect failure, so that sentinel is kept as is.
  def parse(string)
    doc = @grammar.parse(string)
    return report_failures(string) if doc.nil?

    doc.elements.map do |line|
      subjects = line.subjects.elements.map { |subject| subject_details(subject) }
      {
        :number => line.number.text_value,
        :company => (line.company.respond_to?(:name) ? line.company.name.text_value : nil),
        :subjects => subjects,
        :message => line.message.text_value
      }
    end
  end

  protected

  # Writes one diagnostic line per terminal failure recorded by the grammar
  # and returns the {} failure sentinel.
  def report_failures(string)
    @grammar.terminal_failures.each do |failure|
      $stderr.puts "Expected #{failure.expected_string.inspect} (#{failure.index})- #{string[failure.index,10].inspect}"
    end
    {}
  end

  # Builds the hash describing a single subject node. A subject that is a bare
  # extension has no +number+ accessor, so its :number is nil.
  def subject_details(subject)
    number = subject.respond_to?(:number) ? subject.number.text_value : nil
    name, date, location, company = extract_extensions(subject)
    {
      :number => number,
      :name_extension => name,
      :date_extension => date,
      :location_extension => location,
      :company_extension => company,
    }
  end

  # Returns [name, date, location, company] for +subject+; each entry is nil
  # when the corresponding extension is absent. If several extensions of the
  # same kind are present, the last one wins.
  def extract_extensions(subject)
    name = date = location = company = nil

    if subject.respond_to?(:extensions)
      # The company part is optional; an unmatched optional node has empty
      # text. Core String#empty? is used so this does not depend on a
      # library-provided String#blank?.
      company = subject.company.name.text_value unless subject.company.text_value.empty?
      extensions = subject.extensions.elements
    else
      # The subject is itself a single extension node.
      extensions = [subject]
    end

    extensions.each do |extension|
      if extension.respond_to?(:date)
        date = extension.date.text_value
      elsif extension.respond_to?(:location)
        location = extension.location.text_value
      else
        name = extension.text_value
      end
    end

    [name, date, location, company]
  end
end
 
# When run as a script: parse the data file one line at a time, echoing the
# zero-based line index, the raw line and the parsed details. Stops at the
# first line the parser rejects (LonclassParser.parse returns {} on failure).
if __FILE__ == $0
  # Block form closes the file even if parsing raises. Iterating the IO object
  # yields lines; String is not Enumerable on Ruby 1.9+, so calling
  # each_with_index on the file contents would fail there.
  File.open('data/converted-LONCLASS_con.txt') do |file|
    file.each_with_index do |line, i|
      puts i
      puts line
      details = LonclassParser.parse(line)
      break if details == {}
      pp details
    end
  end
end
lonclass.treetop
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
# Treetop grammar for LONCLASS index lines of the form
#   <number>|[<company>]<subjects> |<message><eol>
grammar LonclassGrammar
  # Root rule (the first rule in the grammar): a document is a run of lines.
  rule lonclass
    lines
  end

  rule lines
    line+
  end

  # One record. The company and the subjects are optional; at least one space
  # must precede the second '|'.
  rule line
    number '|' company:company? subjects:subject* ' '+ '|' message eol
  end

  # Bracketed company name; the text between the brackets is labelled +name+.
  rule company
    '[' name:char+ ']'
  end

  # Either a bare extension, or a number followed by any extensions, an
  # optional ':' separator and an optional company.
  rule subject
    extension / (number:number+ extensions:extension* ':'? company:company?)
  end

  # Ordered choice: location is tried first, then date, then name.
  rule extension
    location_extension / date_extension / name_extension
  end

  # The only definition of +number+. It is used both for the leading record
  # number in +line+ and for subject numbers, so both accept digit groups
  # optionally prefixed by '.', '-' or '/'.
  rule number
    subject_class+
  end

  rule subject_class
    ('.' / '-' / '/')* [0-9]+
  end

  # Double-quoted date made of digits, dots and spaces.
  rule date_extension
    '"' date:[. 0-9]+ '"'
  end

  # Parenthesised location; its content is itself parsed as a subject.
  rule location_extension
    '(' location:subject ')'
  end

  # Upper-case name; a space must be followed by a comma, digit or upper-case letter.
  rule name_extension
    ([A-Z] ([,A-Z] / ' ' [,0-9A-Z])+)
  end

  rule message
    char+
  end

  # NOTE(review): ")-/" inside this class reads as a character range from ')'
  # to '/', which would also admit '*', '+', ',' and '.'. Confirm that this is
  # intended rather than a literal '-' and '/'.
  rule char
    ['A-Z0-9& ()-/]
  end

  # CR, CRLF or LF.
  rule eol
    ("\r" "\n"?) / "\n"
  end
end
lonclass_spec.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
#!/usr/bin/env spec
# via "sudo gem install rspec"
 
require 'lonclass'
 
# Examples for LonclassParser.parse. Each example feeds one newline-terminated
# line to the parser and compares the resulting array of hashes.
describe "Lonclass parser" do
  it "should parse a line with a blank company name" do
    str = "80276|[ ] |COMPANIES\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[],
      :message=>"COMPANIES",
      :number=>"80276",
      :company=>" "
    }]
  end

  it "should parse a line with a company name" do
    str = "80277|[A AND W] |A AND W (FAST FOOD COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[],
      :message=>"A AND W (FAST FOOD COMPANY)",
      :number=>"80277",
      :company=>"A AND W"
    }]
  end

  it "should parse a line that has a number in the company name" do
    str = "80342|[AIRE O2] |AIRE O2 (COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[],
      :message=>"AIRE O2 (COMPANY)",
      :number=>"80342",
      :company=>"AIRE O2"
    }]
  end

  it "should parse a line with a company name and a quote in the message" do
    str = "80279|[A J BRETT] |A' J BRETT (ANTIQUE RESTORERS)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[],
      :message=>"A' J BRETT (ANTIQUE RESTORERS)",
      :number=>"80279",
      :company=>"A J BRETT"
    }]
  end

  it "should parse a line with a company name and a subject number" do
    str = "80285|[ABB TRANSPORTATION].007.004.761 |JOB LOSSES AT ABB TRANSPORTATION\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>".007.004.761", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil}
      ],
      :message=>"JOB LOSSES AT ABB TRANSPORTATION",
      :number=>"80285",
      :company=>"ABB TRANSPORTATION"
    }]
  end

  it "should parse a line with a company name and a subject number with a name extension" do
    str = "80309|[AD DAF].008.02LEYLAND DAF |LLEYLAND DAF (COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>".008.02", :name_extension=>"LEYLAND DAF", :location_extension=>nil, :date_extension=>nil, :company_extension=>nil}
      ],
      :message=>"LLEYLAND DAF (COMPANY)",
      :number=>"80309",
      :company=>"AD DAF"
    }]
  end

  it "should parse a line with a company name and a conjoined subject number" do
    str = "80720|[BEDFORD].008.24TRUCK.004.6 |DECLINE OF BEDFORD TRUCKS (COMPANY)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>".008.24", :name_extension=>"TRUCK", :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>'.004.6', :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"DECLINE OF BEDFORD TRUCKS (COMPANY)",
      :number=>"80720",
      :company=>"BEDFORD"
    }]
  end

  it "should parse a line with a company name and 2 related subject numbers" do
    str = "80586|[BARLOW CLOWES]658.111:332.6 |BARLOW CLOWES INVESTMENT GROUP\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"658.111", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>'332.6', :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"BARLOW CLOWES INVESTMENT GROUP",
      :number=>"80586",
      :company=>"BARLOW CLOWES"
    }]
  end

  it "should parse a line with a company name and several related subject numbers ending in a location" do
    str = "80316|[ADIDAS]301.161.1:3-058.1:796.007.009.031:796.334.1.007(429) |ADIDAS BOOT MONEY SCANDAL (PAYMENTS TO AMATEUR WELSH RUGBY PLAYERS)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"301.161.1", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>'3-058.1', :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>'796.007.009.031', :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>'796.334.1.007', :name_extension=>nil, :location_extension=>"429", :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"ADIDAS BOOT MONEY SCANDAL (PAYMENTS TO AMATEUR WELSH RUGBY PLAYERS)",
      :number=>"80316",
      :company=>"ADIDAS"
    }]
  end

  it "should parse a line with a company name and subject number with a date" do
    str = "80404|[AMERICAN EXPRESS].093\"1991\" |AMERICAN EXPRESS BANK AWARD 1991\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>".093", :name_extension=>nil, :location_extension=>nil, :date_extension=>"1991", :company_extension=>nil},
      ],
      :message=>"AMERICAN EXPRESS BANK AWARD 1991",
      :number=>"80404",
      :company=>"AMERICAN EXPRESS"
    }]
  end

  it "should parse a line with a subject number and no company name" do
    str = "270789|621.452.002.793.2 |JET ENGINE MUFFLERS\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"621.452.002.793.2", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"JET ENGINE MUFFLERS",
      :number=>"270789",
      :company=>nil
    }]
  end

  it "should parse a line with a slash in the location extension" do
    str = "270790|321.61(421/425)JAMES II |JAMES II (BRITISH KING)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"321.61", :name_extension=>"JAMES II", :location_extension=>"421/425", :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"JAMES II (BRITISH KING)",
      :number=>"270790",
      :company=>nil
    }]
  end

  it "should parse a line with a relation after a name extension" do
    str = "80670|[BBC]654.192.77SUB:654.19:629.195 |BBC SUBSCRIPTION CHANNEL\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"654.192.77", :name_extension=>"SUB", :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>"654.19", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>"629.195", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"BBC SUBSCRIPTION CHANNEL",
      :number=>"80670",
      :company=>"BBC"
    }]
  end

  it "should parse a line with a location extension after a name extension" do
    str = "80681|[BBC]654.193ENGLISH(73) |BBC WORLD SERVICE RADIO BROADCAST TO USA\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"654.193", :name_extension=>"ENGLISH", :location_extension=>"73", :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"BBC WORLD SERVICE RADIO BROADCAST TO USA",
      :number=>"80681",
      :company=>"BBC"
    }]
  end

  it "should parse a line with multiple subject numbers" do
    str = "80809|[BNOC]338.532.31:665.4/.5(261.2) |BNOC CUT PRICE OF NORTH SEA OIL\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>"338.532.31", :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension=>nil},
        {:number=>"665.4/.5", :name_extension=>nil, :location_extension=>'261.2', :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"BNOC CUT PRICE OF NORTH SEA OIL",
      :number=>"80809",
      :company=>"BNOC"
    }]
  end

  it "should parse a line with a company name followed by a location" do
    str = "80862|[BRENT WALKER](047.1) |BRENT WALKER ANNUAL REPORTS\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>nil, :name_extension=>nil, :location_extension=>'047.1', :date_extension=>nil, :company_extension=>nil},
      ],
      :message=>"BRENT WALKER ANNUAL REPORTS",
      :number=>"80862",
      :company=>"BRENT WALKER"
    }]
  end

  it "should parse a line with a subject number followed by a company extension" do
    str = "80923|[BRITISH LEYLAND].008.01[JAGUAR] |JAGUAR CARS LTD (BL SUB GROUP)\n"
    LonclassParser.parse(str).should == [{
      :subjects=>[
        {:number=>'.008.01', :name_extension=>nil, :location_extension=>nil, :date_extension=>nil, :company_extension => 'JAGUAR'},
      ],
      :message=>"JAGUAR CARS LTD (BL SUB GROUP)",
      :number=>"80923",
      :company=>"BRITISH LEYLAND"
    }]
  end
end

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.