Skip to content

Instantly share code, notes, and snippets.

@palkan
Created December 1, 2014 12:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save palkan/44837275114d66a24df4 to your computer and use it in GitHub Desktop.
Save palkan/44837275114d66a24df4 to your computer and use it in GitHub Desktop.
[ruby] cyrillic detection bench
# Two alternative implementations of a "Russian-only text" check, kept side by
# side so they can be benchmarked against each other. Both accept Russian
# letters plus common punctuation/digits and reject ASCII Latin letters.
class TestCyrillic
  class << self
    # Code points accepted by +cyrillic?+:
    #   1040..1103        - U+0410..U+044F, the Russian letters А..я
    #   32..64            - space, ASCII punctuation and digits (up to "@")
    #   91..96, 123..126  - ASCII punctuation surrounding the Latin letter ranges
    #   1025, 1105, 8470  - Ё, ё and №
    RUSSIAN_CODES = (
      (1040..1103).to_a + (32..64).to_a + (91..96).to_a + (123..126).to_a + [1025, 1105, 8470]
    ).freeze

    # Checks the string character by character against RUSSIAN_CODES.
    #
    # The argument is interpreted as UTF-8 but is never modified.
    #
    # @param string [String] text to check
    # @return [Boolean] true when every character's code point is listed in
    #   RUSSIAN_CODES (so an empty string yields true), false otherwise
    def cyrillic?(string)
      # all? stops at the first character that is not allowed.
      utf8(string).each_char.all? { |char| RUSSIAN_CODES.include?(char.ord) }
    end

    # \u2116 - №
    # \u0410 - \u044f - the Cyrillic letters themselves
    # ... - other common symbols
    #
    # Anchored with \A and \z so the WHOLE string must match; ^ and $ match at
    # every line boundary and would accept "Русские\nEnglish".
    # NOTE(review): unlike RUSSIAN_CODES this class also accepts U+00A2..U+00BF
    # and U+0100..U+0171 - confirm whether the two checks are meant to differ.
    RXP = /\A[\u2116\u0020-\u0040\u005b-\u0060\u007b-\u007e\u00a2-\u00bf\u0100-\u0171\u0401\u0451\u0410-\u044f]+\z/

    # Checks the whole string against RXP.
    #
    # The argument is interpreted as UTF-8 but is never modified.
    #
    # @param string [String] text to check
    # @return [Boolean] true when the string is non-empty and consists only of
    #   characters allowed by RXP, false otherwise (including the empty string)
    def cyr?(string)
      !!(utf8(string) =~ RXP)
    end

    private

    # Returns the string itself when it is already tagged as UTF-8; otherwise
    # returns a copy re-tagged as UTF-8. Working on a copy keeps the caller's
    # string untouched and lets frozen strings be checked.
    def utf8(string)
      return string if string.encoding == Encoding::UTF_8

      string.dup.force_encoding(Encoding::UTF_8)
    end
  end
end
# Reopens String to expose both TestCyrillic checks as instance predicates.
class String
  # Passes the receiver to TestCyrillic.cyrillic? and returns its result.
  def cyrillic?
    TestCyrillic.cyrillic?(self)
  end

  # Passes the receiver to TestCyrillic.cyr? and returns its result.
  def cyr?
    TestCyrillic.cyr?(self)
  end
end
# Benchmark driver: calls #cyrillic? on every argument and discards the
# results. Returns the array of arguments.
def run_cyrillic(*args)
  args.each(&:cyrillic?)
end
# Benchmark driver: calls #cyr? on every argument and discards the results.
# Returns the array of arguments.
def run_cyr(*args)
  args.each(&:cyr?)
end
# Minimal assertion helper: calls the block and raises
# RuntimeError "assert failed" when it returns a falsy value.
# Returns nil when the block's result is truthy.
def assert(&block)
  return if block.call

  raise "assert failed"
end
# Negated assertion helper: calls the block and raises
# RuntimeError "assert failed" when it returns a truthy value.
# Returns nil when the block's result is falsy.
def assertNot(&block)
  return unless block.call

  raise "assert failed"
end
# Sanity checks run before benchmarking: both predicates must accept the
# Russian-only samples and reject every sample containing Latin letters.
russian_only = [
  "Русские буквы",
  "Русские буквы и !№%:,.;()_+=-0986654",
  "!№%:,.;()_+=-0986654"
]
with_latin = [
  "Русские буквы и English letters",
  "Only English letters",
  "English letters and !№%:,.;()_+=-0986654"
]

# Same order as a hand-written list: all #cyrillic? checks first, then #cyr?.
%i[cyrillic? cyr?].each do |predicate|
  russian_only.each { |text| assert { text.public_send(predicate) } }
  with_latin.each { |text| assertNot { text.public_send(predicate) } }
end
require 'benchmark'
# Inputs fed to both implementations on every benchmark pass: three
# Russian-only samples and three samples containing Latin letters.
test_data = [
  "Русские буквы",
  "Русские буквы и !№%:,.;()_+=-0986654",
  "!№%:,.;()_+=-0986654",
  "Русские буквы и English letters",
  "Only English letters",
  "English letters and !№%:,.;()_+=-0986654"
]

# Number of passes over test_data per measured report.
n = 50_000

# Each report is labelled so the two timing rows can be told apart in the
# output; the argument to bm is the width reserved for the label column.
Benchmark.bm(10) do |x|
  x.report("cyrillic?") { n.times { run_cyrillic(*test_data) } }
  x.report("cyr?") { n.times { run_cyr(*test_data) } }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment