Ruby Character Classes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding=utf-8
# Patterns to census against every encodable UTF-8 character.
#
# Two-element entries pair a POSIX bracket expression with the \p{...}
# property of the same name, so the two spellings can be compared.
# Bare entries are single \p{...} properties (Any, Assigned, and the
# one- and two-letter general-category names), each listed exactly once.
REGEXPS = [
  [/[[:alnum:]]/, /\p{Alnum}/],
  [/[[:alpha:]]/, /\p{Alpha}/],
  [/[[:blank:]]/, /\p{Blank}/],
  [/[[:cntrl:]]/, /\p{Cntrl}/],
  [/[[:digit:]]/, /\p{Digit}/],
  [/[[:graph:]]/, /\p{Graph}/],
  [/[[:lower:]]/, /\p{Lower}/],
  [/[[:print:]]/, /\p{Print}/],
  [/[[:punct:]]/, /\p{Punct}/],
  [/[[:space:]]/, /\p{Space}/],
  [/[[:upper:]]/, /\p{Upper}/],
  [/[[:xdigit:]]/, /\p{XDigit}/],
  [/[[:word:]]/, /\p{Word}/],
  [/[[:ascii:]]/, /\p{ASCII}/],
  /\p{Any}/,
  /\p{Assigned}/,
  # Letters
  /\p{L}/,
  /\p{Ll}/,
  /\p{Lm}/,
  /\p{Lo}/,
  /\p{Lt}/,
  /\p{Lu}/,
  # Marks
  /\p{M}/,
  /\p{Mn}/,
  /\p{Mc}/,
  /\p{Me}/,
  # Numbers
  /\p{N}/,
  /\p{Nd}/,
  /\p{Nl}/,
  /\p{No}/,
  # Punctuation
  /\p{P}/,
  /\p{Pc}/,
  /\p{Pd}/,
  /\p{Ps}/,
  /\p{Pe}/,
  /\p{Pi}/,
  /\p{Pf}/,
  /\p{Po}/,
  # Symbols
  /\p{S}/,
  /\p{Sm}/,
  /\p{Sc}/,
  /\p{Sk}/,
  /\p{So}/,
  # Separators
  /\p{Z}/,
  /\p{Zs}/,
  /\p{Zl}/,
  /\p{Zp}/,
  # Other
  /\p{C}/,
  /\p{Cc}/,
  /\p{Cf}/,
  /\p{Cn}/,
  /\p{Co}/,
  /\p{Cs}/,
].freeze
# Highest Unicode codepoint (U+10FFFF). The enumeration below is inclusive
# of this value, so the final codepoint is part of the census.
CODEPOINTS = 0x10ffff

min = CODEPOINTS # lowest codepoint that could NOT be encoded as UTF-8
max = 0          # highest codepoint that was encoded successfully

# Encode every codepoint from 0 through CODEPOINTS; codepoints that
# Integer#chr rejects (it raises RangeError) are recorded as nil.
encoded = (0..CODEPOINTS).map do |n|
  begin
    char = n.chr('UTF-8')
    max = n if n > max
    char
  rescue RangeError
    min = n if n < min
    nil
  end
end

# One string holding each encodable character exactly once, in codepoint order.
all_string = encoded.compact.join

warn "codepoints #{CODEPOINTS}, min:#{min}, max:#{max}, #{all_string.codepoints.size}, #{all_string.size}"
#REGEXPS.each do |re|
#  if Array===re
#    matches1 = all_string.scan(re.first)
#    matches2 = all_string.scan(re.last)
#    warn "matches #{re}, #{matches1 == matches2}, #{matches1.size}, #{matches2.size}"
#  else
#    matches = all_string.scan(re)
#    warn "re is #{re}, chars: #{matches.size}"
#  end
#end
# Properties whose matching characters are listed one by one (rather than
# merely counted) by the report loop that iterates over this constant.
LIST_RE = [
  /\p{Blank}/,
  /\p{Space}/,
  /\p{XDigit}/,
  /\p{Lt}/,
  /\p{Me}/,
].freeze
# For each pattern in LIST_RE, write to stderr the number of characters of
# all_string it matches, followed by one line per match giving the decimal
# codepoint, the inspected form, and the raw character in brackets.
LIST_RE.each do |re|
  matches = all_string.scan(re)
  listing = matches.map { |c| "#{c.ord}, #{c.inspect}[#{c}]" }.join("\n")
  warn "chars for #{re.inspect}, #{matches.size}, \n#{listing}"
end
=begin | |
codepoints 1114111, min:55296, max:1114110, 1112063, 1112063 | |
matches [/[[:alnum:]]/, /\p{Alnum}/], true, 102619, 102619 | |
matches [/[[:alpha:]]/, /\p{Alpha}/], true, 102159, 102159 | |
matches [/[[:blank:]]/, /\p{Blank}/], true, 19, 19 | |
matches [/[[:cntrl:]]/, /\p{Cntrl}/], true, 65, 65 | |
matches [/[[:digit:]]/, /\p{Digit}/], true, 460, 460 | |
matches [/[[:graph:]]/, /\p{Graph}/], true, 247564, 247564 | |
matches [/[[:lower:]]/, /\p{Lower}/], true, 1934, 1934 | |
matches [/[[:print:]]/, /\p{Print}/], true, 247582, 247582 | |
matches [/[[:punct:]]/, /\p{Punct}/], true, 632, 632 | |
matches [/[[:space:]]/, /\p{Space}/], true, 26, 26 | |
matches [/[[:upper:]]/, /\p{Upper}/], true, 1483, 1483 | |
matches [/[[:xdigit:]]/, /\p{XDigit}/], true, 22, 22 | |
matches [/[[:word:]]/, /\p{Word}/], true, 103404, 103404 | |
matches [/[[:ascii:]]/, /\p{ASCII}/], true, 128, 128 | |
re is (\p{Any}), chars: 1112063 | |
re is (\p{Assigned}), chars: 247649 | |
re is (\p{L}), chars: 101013 | |
re is (\p{Ll}), chars: 1751 | |
re is (\p{Lm}), chars: 237 | |
re is (\p{Lo}), chars: 97553 | |
re is (\p{Lt}), chars: 31 | |
re is (\p{Lu}), chars: 1441 | |
re is (\p{Lo}), chars: 97553 | |
re is (\p{M}), chars: 1645 | |
re is (\p{Mn}), chars: 1280 | |
re is (\p{Mc}), chars: 353 | |
re is (\p{Me}), chars: 12 | |
re is (\p{N}), chars: 1148 | |
re is (\p{Nd}), chars: 460 | |
re is (\p{Nl}), chars: 224 | |
re is (\p{No}), chars: 464 | |
re is (\p{P}), chars: 632 | |
re is (\p{Pc}), chars: 10 | |
re is (\p{Pd}), chars: 23 | |
re is (\p{Ps}), chars: 72 | |
re is (\p{Pe}), chars: 71 | |
re is (\p{Pi}), chars: 12 | |
re is (\p{Pf}), chars: 10 | |
re is (\p{Po}), chars: 434 | |
re is (\p{S}), chars: 5519 | |
re is (\p{Sm}), chars: 952 | |
re is (\p{Sc}), chars: 48 | |
re is (\p{Sc}), chars: 48 | |
re is (\p{Sk}), chars: 115 | |
re is (\p{So}), chars: 4404 | |
re is (\p{Z}), chars: 20 | |
re is (\p{Zs}), chars: 18 | |
re is (\p{Zl}), chars: 1 | |
re is (\p{Zp}), chars: 1 | |
re is (\p{C}), chars: 1002086 | |
re is (\p{Cc}), chars: 65 | |
re is (\p{Cf}), chars: 139 | |
re is (\p{Cn}), chars: 864414 | |
re is (\p{Co}), chars: 137468 | |
re is (\p{Cs}), chars: 0 | |
=end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment