Skip to content

Instantly share code, notes, and snippets.

@athoune
Created December 9, 2015 22:10
Show Gist options
  • Save athoune/123396c44638e14477fb to your computer and use it in GitHub Desktop.
Save athoune/123396c44638e14477fb to your computer and use it in GitHub Desktop.
Prefix index for massive regexp parsing, like Piwik does
require "set"
require 'regexp_parser'
require 'commons-collections4-4.1.jar'
module PreRegexp
def self.tokenize sentence
sentence.split(/[\s;()|]+/).sort.reverse
end
def self.prefixes tree, stack
first = (tree.type == :expression) ? tree.first : tree
type = first.type
if type == :literal
stack << first.text
elsif type == :group
first.expressions.each do |expression|
stack = prefixes(expression, stack)
end
elsif type == :meta
if first.token == :alternation
first.expressions.each do |expression|
stack = prefixes(expression, stack)
end
end
end
stack
end
class Index
attr_reader :r
def initialize
@trie = org.apache.commons.collections4.trie.PatriciaTrie.new
@regex = Array.new
@min = 9999
@r = 0 # Number of launched regexp
end
def add re
tree = Regexp::Parser.parse re
PreRegexp::prefixes(tree, Array.new).each do |pre|
pre = PreRegexp::tokenize(pre)[0]
@trie.put pre, @regex.length
l = pre.length
@min = l if l < @min
end
@regex << Regexp.new(re)
end
def match sentence
PreRegexp::tokenize(sentence).each do |token|
next if token.length < @min
bad = Set.new
toks = [token]
(0..2).each do |m|
min = @min + 2 - m
tok = token[0..min-1]
toks << tok if tok.length >= @min
end
toks.each do |tok|
@trie.prefixMap(tok).each do |prefix, poz|
next if bad.include? poz
r = @regex[poz]
@r += 1
return poz if r.match sentence
bad.add poz
end
end
end
nil
end
end
end
#!/usr/bin/env ruby
require "yaml"
require "pp"
load "pregexp.rb"
bots = YAML.load(File.read('bots.yml'))
idx = PreRegexp::Index.new
bots.each do |bot|
re = bot['regex']
idx.add re
end
pp idx.match "The big Yahoo Slurp"
pp idx.match "The 42 fat fake Yahooo"
pp idx.match "Where is my baidu"
pp idx.match "I am baiduspider"
pp idx.r
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment