Skip to content

Instantly share code, notes, and snippets.

@GerryG
Created December 31, 2014 20:10
Show Gist options
  • Save GerryG/5f2993f262fbe14f57f2 to your computer and use it in GitHub Desktop.
Save GerryG/5f2993f262fbe14f57f2 to your computer and use it in GitHub Desktop.
Ruby Character Classes
# encoding=utf-8
# Patterns for the character-class survey.
#
# A two-element entry pairs a POSIX bracket class with the \p{...} property of
# the same name, so both can be scanned and their matches compared. A bare
# entry is a single \p{...} property (mostly Unicode general categories) whose
# matches are simply counted.
#
# Each property appears exactly once; the list and its pairs are frozen so the
# constant cannot be mutated by accident.
REGEXPS = [
  [/[[:alnum:]]/, /\p{Alnum}/],
  [/[[:alpha:]]/, /\p{Alpha}/],
  [/[[:blank:]]/, /\p{Blank}/],
  [/[[:cntrl:]]/, /\p{Cntrl}/],
  [/[[:digit:]]/, /\p{Digit}/],
  [/[[:graph:]]/, /\p{Graph}/],
  [/[[:lower:]]/, /\p{Lower}/],
  [/[[:print:]]/, /\p{Print}/],
  [/[[:punct:]]/, /\p{Punct}/],
  [/[[:space:]]/, /\p{Space}/],
  [/[[:upper:]]/, /\p{Upper}/],
  [/[[:xdigit:]]/, /\p{XDigit}/],
  [/[[:word:]]/, /\p{Word}/],
  [/[[:ascii:]]/, /\p{ASCII}/],
  /\p{Any}/,
  /\p{Assigned}/,
  # Letters
  /\p{L}/,
  /\p{Ll}/,
  /\p{Lm}/,
  /\p{Lo}/,
  /\p{Lt}/,
  /\p{Lu}/,
  # Marks
  /\p{M}/,
  /\p{Mn}/,
  /\p{Mc}/,
  /\p{Me}/,
  # Numbers
  /\p{N}/,
  /\p{Nd}/,
  /\p{Nl}/,
  /\p{No}/,
  # Punctuation
  /\p{P}/,
  /\p{Pc}/,
  /\p{Pd}/,
  /\p{Ps}/,
  /\p{Pe}/,
  /\p{Pi}/,
  /\p{Pf}/,
  /\p{Po}/,
  # Symbols
  /\p{S}/,
  /\p{Sm}/,
  /\p{Sc}/,
  /\p{Sk}/,
  /\p{So}/,
  # Separators
  /\p{Z}/,
  /\p{Zs}/,
  /\p{Zl}/,
  /\p{Zp}/,
  # Other
  /\p{C}/,
  /\p{Cc}/,
  /\p{Cf}/,
  /\p{Cn}/,
  /\p{Co}/,
  /\p{Cs}/,
].each(&:freeze).freeze
# Highest code point that is tried (0x10ffff).
CODEPOINTS = 0x10ffff
# Lowest code point that Integer#chr rejected; stays at CODEPOINTS if none was.
min = CODEPOINTS
# Highest code point that was encoded successfully.
max = 0
# One string containing every code point that can be encoded as UTF-8, in
# ascending order. The range is inclusive so that CODEPOINTS itself is tried;
# `CODEPOINTS.times` stopped one short of it.
all_string = (0..CODEPOINTS).map do |n|
  begin
    char = n.chr('UTF-8')
    max = n if n > max
    char
  rescue RangeError
    # Integer#chr raises RangeError for a code point it cannot encode.
    min = n if n < min
    nil
  end
end.compact.join
warn "codepoints #{CODEPOINTS}, min:#{min}, max:#{max}, #{all_string.codepoints.size}, #{all_string.size}"
#REGEXPS.each do |re|
# if Array===re
# matches1 = all_string.scan(re.first)
# matches2 = all_string.scan(re.last)
# warn "matches #{re}, #{matches1 == matches2}, #{matches1.size}, #{matches2.size}"
# else
# matches = all_string.scan(re)
# warn "re is #{re}, chars: #{matches.size}"
# end
#end
# Properties whose matching characters are listed one by one (rather than just
# counted) by the report loop that iterates this list. Frozen so the constant
# cannot be mutated by accident.
LIST_RE = [
  /\p{Blank}/,
  /\p{Space}/,
  /\p{XDigit}/,
  /\p{Lt}/,
  /\p{Me}/,
].freeze
# For each pattern in LIST_RE, write to stderr the number of matching
# characters in all_string, then one line per match in the form:
# "<code point>, <inspected char>[<raw char>]".
LIST_RE.each do |pattern|
  found = all_string.scan(pattern)
  listing = found.map { |char| "#{char.ord}, #{char.inspect}[#{char}]" }.join("\n")
  warn "chars for #{pattern.inspect}, #{found.size}, \n#{listing}"
end
=begin
codepoints 1114111, min:55296, max:1114110, 1112063, 1112063
matches [/[[:alnum:]]/, /\p{Alnum}/], true, 102619, 102619
matches [/[[:alpha:]]/, /\p{Alpha}/], true, 102159, 102159
matches [/[[:blank:]]/, /\p{Blank}/], true, 19, 19
matches [/[[:cntrl:]]/, /\p{Cntrl}/], true, 65, 65
matches [/[[:digit:]]/, /\p{Digit}/], true, 460, 460
matches [/[[:graph:]]/, /\p{Graph}/], true, 247564, 247564
matches [/[[:lower:]]/, /\p{Lower}/], true, 1934, 1934
matches [/[[:print:]]/, /\p{Print}/], true, 247582, 247582
matches [/[[:punct:]]/, /\p{Punct}/], true, 632, 632
matches [/[[:space:]]/, /\p{Space}/], true, 26, 26
matches [/[[:upper:]]/, /\p{Upper}/], true, 1483, 1483
matches [/[[:xdigit:]]/, /\p{XDigit}/], true, 22, 22
matches [/[[:word:]]/, /\p{Word}/], true, 103404, 103404
matches [/[[:ascii:]]/, /\p{ASCII}/], true, 128, 128
re is (\p{Any}), chars: 1112063
re is (\p{Assigned}), chars: 247649
re is (\p{L}), chars: 101013
re is (\p{Ll}), chars: 1751
re is (\p{Lm}), chars: 237
re is (\p{Lo}), chars: 97553
re is (\p{Lt}), chars: 31
re is (\p{Lu}), chars: 1441
re is (\p{Lo}), chars: 97553
re is (\p{M}), chars: 1645
re is (\p{Mn}), chars: 1280
re is (\p{Mc}), chars: 353
re is (\p{Me}), chars: 12
re is (\p{N}), chars: 1148
re is (\p{Nd}), chars: 460
re is (\p{Nl}), chars: 224
re is (\p{No}), chars: 464
re is (\p{P}), chars: 632
re is (\p{Pc}), chars: 10
re is (\p{Pd}), chars: 23
re is (\p{Ps}), chars: 72
re is (\p{Pe}), chars: 71
re is (\p{Pi}), chars: 12
re is (\p{Pf}), chars: 10
re is (\p{Po}), chars: 434
re is (\p{S}), chars: 5519
re is (\p{Sm}), chars: 952
re is (\p{Sc}), chars: 48
re is (\p{Sc}), chars: 48
re is (\p{Sk}), chars: 115
re is (\p{So}), chars: 4404
re is (\p{Z}), chars: 20
re is (\p{Zs}), chars: 18
re is (\p{Zl}), chars: 1
re is (\p{Zp}), chars: 1
re is (\p{C}), chars: 1002086
re is (\p{Cc}), chars: 65
re is (\p{Cf}), chars: 139
re is (\p{Cn}), chars: 864414
re is (\p{Co}), chars: 137468
re is (\p{Cs}), chars: 0
=end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment