Skip to content

Instantly share code, notes, and snippets.

@MaxLap
Forked from sdogruyol/short_source_rabin_karp.cr
Created January 13, 2017 00:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MaxLap/0ab65d0e1466a7ba1107f66d0983be52 to your computer and use it in GitHub Desktop.
Save MaxLap/0ab65d0e1466a7ba1107f66d0983be52 to your computer and use it in GitHub Desktop.
require "benchmark"
require "colorize"
require "http"
require "spec/dsl"
class String
private PRIME_RK = 2097169u32
def index_old(search : String, offset = 0)
offset += size if offset < 0
return nil if offset < 0
end_pos = bytesize - search.bytesize
reader = Char::Reader.new(self)
reader.each_with_index do |char, i|
if reader.pos <= end_pos
if i >= offset && (to_unsafe + reader.pos).memcmp(search.to_unsafe, search.bytesize) == 0
return i
end
else
break
end
end
end
def index_rabin_karp(search : String, offset = 0)
offset += size if offset < 0
return nil if offset < 0
return size < offset ? nil : offset if search.empty?
# Rabin-Karp algorithm
# https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
# calculate a rolling hash of search string (needle)
search_hash = 0u32
search.each_byte do |b|
search_hash = search_hash * PRIME_RK + b
end
pow = PRIME_RK ** search.bytesize
# Find start index with offset
char_index = 0
pointer = to_unsafe
end_pointer = pointer + bytesize
while char_index < offset && pointer < end_pointer
byte = pointer.value
if byte < 0x80
pointer += 1
elsif byte < 0xe0
pointer += 2
elsif byte < 0xf0
pointer += 3
else
pointer += 4
end
char_index += 1
end
head_pointer = pointer
# calculate a rolling hash of this text (heystack)
hash = 0u32
hash_end_pointer = pointer + search.bytesize
return if hash_end_pointer > end_pointer
while pointer < hash_end_pointer
hash = hash * PRIME_RK + pointer.value
pointer += 1
end
while true
# check hash equality and real string equality
if hash == search_hash && head_pointer.memcmp(search.to_unsafe, search.bytesize) == 0
return char_index
end
return if pointer >= end_pointer
byte = head_pointer.value
char_index += 1 if (byte & 0x80) == 0 || (byte & 0x40) != 0
# update a rolling hash of this text (heystack)
hash = hash * PRIME_RK + pointer.value - pow * byte
head_pointer += 1
pointer += 1
end
nil
end
def index_rabin_karp2(search : String, offset = 0)
offset += size if offset < 0
return nil if offset < 0
return size < offset ? nil : offset if search.empty?
# Rabin-Karp algorithm
# https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
# calculate a rolling hash of search string (needle)
search_hash = 0u32
search.each_byte do |b|
search_hash = search_hash * PRIME_RK + b
end
pow = PRIME_RK ** search.bytesize
# Find start index with offset
char_index = 0
pointer = to_unsafe
end_pointer = pointer + bytesize
while char_index < offset && pointer < end_pointer
byte = pointer.value
if byte < 0x80
pointer += 1
elsif byte < 0xe0
pointer += 2
elsif byte < 0xf0
pointer += 3
else
pointer += 4
end
char_index += 1
end
head_pointer = pointer
# calculate a rolling hash of this text (heystack)
hash = 0u32
hash_end_pointer = pointer + search.bytesize
return if hash_end_pointer > end_pointer
while pointer < hash_end_pointer
hash = hash * PRIME_RK + pointer.value
pointer += 1
end
while true
# check hash equality and real string equality
if hash == search_hash && head_pointer.memcmp(search.to_unsafe, search.bytesize) == 0
return char_index
end
return if pointer >= end_pointer
byte = head_pointer.value
if byte < 0x80
hash = hash * PRIME_RK + pointer.value - pow * byte
pointer += 1
head_pointer += 1
elsif byte < 0xe0
hash = hash * PRIME_RK + pointer.value - pow * byte
pointer += 1
head_pointer += 1
byte = head_pointer.value
hash = hash * PRIME_RK + pointer.value - pow * byte
pointer += 1
head_pointer += 1
elsif byte < 0xf0
hash = hash * PRIME_RK + pointer.value - pow * byte
pointer += 1
head_pointer += 1
byte = head_pointer.value
hash = hash * PRIME_RK + pointer.value - pow * byte
pointer += 1
head_pointer += 1
byte = head_pointer.value
hash = hash * PRIME_RK + pointer.value - pow * byte
pointer += 1
head_pointer += 1
else
hash = hash * PRIME_RK + pointer.value - pow * byte
pointer += 1
head_pointer += 1
byte = head_pointer.value
hash = hash * PRIME_RK + pointer.value - pow * byte
pointer += 1
head_pointer += 1
byte = head_pointer.value
hash = hash * PRIME_RK + pointer.value - pow * byte
pointer += 1
head_pointer += 1
byte = head_pointer.value
hash = hash * PRIME_RK + pointer.value - pow * byte
pointer += 1
head_pointer += 1
end
char_index += 1
end
nil
end
end
unless File.exists?("mems.txt")
print "Downloading mems.txt (The Memoirs of Sherlock Holmes)...".colorize(:red)
File.open("mems.txt", "w") do |f|
HTTP::Client.get("https://sherlock-holm.es/stories/plain-text/mems.txt") do |response|
IO.copy response.body_io, f
end
end
puts " done".colorize(:green)
end
source = File.read("mems.txt")
short_source = <<-SHORT
Crystal is great, i love Crystal.
Lorem dolor sip amet, Crystal is great
SHORT
target = "crystal"
n = source.index(target)
short_n = short_source.index(target)
puts "Target for long: #{n}"
puts "Target for short: #{short_n}"
Benchmark.ips do |x|
x.report("index_old") { source.index_old(target).should eq n }
x.report("index_rabin_karp") { source.index_rabin_karp(target).should eq n }
x.report("index_rabin_karp2") { source.index_rabin_karp2(target).should eq n }
x.report("index_short") { short_source.index_old(target).should eq short_n }
x.report("index_short_rabin_karp") { short_source.index_rabin_karp(target).should eq short_n }
x.report("index_short_rabin_karp2") { short_source.index_rabin_karp2(target).should eq short_n }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment