GerryG/gist:5f2993f262fbe14f57f2

## gistfile1.txt
# encoding=utf-8

# Patterns to be run against a string of every encodable code point.
#
# Two-element entries pair a POSIX bracket expression with the \p{...}
# property of the same name, so a consumer can check that both spellings
# select the same characters. Bare entries are single properties: Any,
# Assigned, and the Unicode general categories grouped by major class.
# Every pattern is listed exactly once, and the list is frozen because it is
# only ever read.
REGEXPS = [
  # POSIX bracket class vs. same-named property
  [/[[:alnum:]]/, /\p{Alnum}/],
  [/[[:alpha:]]/, /\p{Alpha}/],
  [/[[:blank:]]/, /\p{Blank}/],
  [/[[:cntrl:]]/, /\p{Cntrl}/],
  [/[[:digit:]]/, /\p{Digit}/],
  [/[[:graph:]]/, /\p{Graph}/],
  [/[[:lower:]]/, /\p{Lower}/],
  [/[[:print:]]/, /\p{Print}/],
  [/[[:punct:]]/, /\p{Punct}/],
  [/[[:space:]]/, /\p{Space}/],
  [/[[:upper:]]/, /\p{Upper}/],
  [/[[:xdigit:]]/, /\p{XDigit}/],
  [/[[:word:]]/, /\p{Word}/],
  [/[[:ascii:]]/, /\p{ASCII}/],
  # Whole-repertoire properties
  /\p{Any}/,
  /\p{Assigned}/,
  # Letters
  /\p{L}/,
  /\p{Ll}/,
  /\p{Lm}/,
  /\p{Lo}/,
  /\p{Lt}/,
  /\p{Lu}/,
  # Marks
  /\p{M}/,
  /\p{Mn}/,
  /\p{Mc}/,
  /\p{Me}/,
  # Numbers
  /\p{N}/,
  /\p{Nd}/,
  /\p{Nl}/,
  /\p{No}/,
  # Punctuation
  /\p{P}/,
  /\p{Pc}/,
  /\p{Pd}/,
  /\p{Ps}/,
  /\p{Pe}/,
  /\p{Pi}/,
  /\p{Pf}/,
  /\p{Po}/,
  # Symbols
  /\p{S}/,
  /\p{Sm}/,
  /\p{Sc}/,
  /\p{Sk}/,
  /\p{So}/,
  # Separators
  /\p{Z}/,
  /\p{Zs}/,
  /\p{Zl}/,
  /\p{Zp}/,
  # Other
  /\p{C}/,
  /\p{Cc}/,
  /\p{Cf}/,
  /\p{Cn}/,
  /\p{Co}/,
  /\p{Cs}/,
].freeze

# Highest code point value that is visited (0x10ffff).
CODEPOINTS = 0x10ffff

# After the scan, min holds the lowest code point that could not be encoded
# and max the highest one that could.
min = CODEPOINTS
max = 0

# Concatenation, in ascending order, of every code point that converts to a
# UTF-8 character. The range is inclusive so that CODEPOINTS itself is
# visited; `CODEPOINTS.times` would stop one short of it.
all_string = (0..CODEPOINTS).map do |n|
  begin
    char = n.chr('UTF-8')
    max = n if n > max
    char
  rescue RangeError
    # Integer#chr raises RangeError for values with no UTF-8 encoding;
    # those are recorded and left out of the string.
    min = n if n < min
    nil
  end
end.compact.join

# Summary on stderr: the limit, both bounds, and the length counted two ways.
warn "codepoints #{CODEPOINTS}, min:#{min}, max:#{max}, #{all_string.codepoints.size}, #{all_string.size}"

#REGEXPS.each do |re|
#  if Array===re
#    matches1 = all_string.scan(re.first)
#    matches2 = all_string.scan(re.last)
#    warn "matches #{re}, #{matches1 == matches2}, #{matches1.size}, #{matches2.size}"
#  else
#    matches = all_string.scan(re)
#    warn "re is #{re}, chars: #{matches.size}"
#  end
#end

# Properties whose individual matches are listed in full.
LIST_RE = [
  /\p{Blank}/,
  /\p{Space}/,
  /\p{XDigit}/,
  /\p{Lt}/,
  /\p{Me}/,
]

# For each property, write to stderr the number of matching characters in
# all_string, followed by one line per match: its code point, its inspected
# form, and the raw character in brackets.
LIST_RE.each do |pattern|
  hits = all_string.scan(pattern)
  listing = hits.map do |char|
    "#{char.codepoints.first}, #{char.inspect}[#{char}]"
  end
  warn "chars for #{pattern.inspect}, #{hits.size}, \n#{listing.join("\n")}"
end

=begin
codepoints 1114111, min:55296, max:1114110, 1112063, 1112063
matches [/[[:alnum:]]/, /\p{Alnum}/], true, 102619, 102619
matches [/[[:alpha:]]/, /\p{Alpha}/], true, 102159, 102159
matches [/[[:blank:]]/, /\p{Blank}/], true, 19, 19
matches [/[[:cntrl:]]/, /\p{Cntrl}/], true, 65, 65
matches [/[[:digit:]]/, /\p{Digit}/], true, 460, 460
matches [/[[:graph:]]/, /\p{Graph}/], true, 247564, 247564
matches [/[[:lower:]]/, /\p{Lower}/], true, 1934, 1934
matches [/[[:print:]]/, /\p{Print}/], true, 247582, 247582
matches [/[[:punct:]]/, /\p{Punct}/], true, 632, 632
matches [/[[:space:]]/, /\p{Space}/], true, 26, 26
matches [/[[:upper:]]/, /\p{Upper}/], true, 1483, 1483
matches [/[[:xdigit:]]/, /\p{XDigit}/], true, 22, 22
matches [/[[:word:]]/, /\p{Word}/], true, 103404, 103404
matches [/[[:ascii:]]/, /\p{ASCII}/], true, 128, 128
re is (\p{Any}), chars: 1112063
re is (\p{Assigned}), chars: 247649
re is (\p{L}), chars: 101013
re is (\p{Ll}), chars: 1751
re is (\p{Lm}), chars: 237
re is (\p{Lo}), chars: 97553
re is (\p{Lt}), chars: 31
re is (\p{Lu}), chars: 1441
re is (\p{Lo}), chars: 97553
re is (\p{M}), chars: 1645
re is (\p{Mn}), chars: 1280
re is (\p{Mc}), chars: 353
re is (\p{Me}), chars: 12
re is (\p{N}), chars: 1148
re is (\p{Nd}), chars: 460
re is (\p{Nl}), chars: 224
re is (\p{No}), chars: 464
re is (\p{P}), chars: 632
re is (\p{Pc}), chars: 10
re is (\p{Pd}), chars: 23
re is (\p{Ps}), chars: 72
re is (\p{Pe}), chars: 71
re is (\p{Pi}), chars: 12
re is (\p{Pf}), chars: 10
re is (\p{Po}), chars: 434
re is (\p{S}), chars: 5519
re is (\p{Sm}), chars: 952
re is (\p{Sc}), chars: 48
re is (\p{Sc}), chars: 48
re is (\p{Sk}), chars: 115
re is (\p{So}), chars: 4404
re is (\p{Z}), chars: 20
re is (\p{Zs}), chars: 18
re is (\p{Zl}), chars: 1
re is (\p{Zp}), chars: 1
re is (\p{C}), chars: 1002086
re is (\p{Cc}), chars: 65
re is (\p{Cf}), chars: 139
re is (\p{Cn}), chars: 864414
re is (\p{Co}), chars: 137468
re is (\p{Cs}), chars: 0
=end
	# encoding=utf-8

	# Patterns to be run against a string of every encodable code point.
	#
	# Two-element entries pair a POSIX bracket expression with the \p{...}
	# property of the same name, so a consumer can check that both spellings
	# select the same characters. Bare entries are single properties: Any,
	# Assigned, and the Unicode general categories grouped by major class.
	# Every pattern is listed exactly once, and the list is frozen because it
	# is only ever read.
	REGEXPS = [
	  # POSIX bracket class vs. same-named property
	  [/[[:alnum:]]/, /\p{Alnum}/],
	  [/[[:alpha:]]/, /\p{Alpha}/],
	  [/[[:blank:]]/, /\p{Blank}/],
	  [/[[:cntrl:]]/, /\p{Cntrl}/],
	  [/[[:digit:]]/, /\p{Digit}/],
	  [/[[:graph:]]/, /\p{Graph}/],
	  [/[[:lower:]]/, /\p{Lower}/],
	  [/[[:print:]]/, /\p{Print}/],
	  [/[[:punct:]]/, /\p{Punct}/],
	  [/[[:space:]]/, /\p{Space}/],
	  [/[[:upper:]]/, /\p{Upper}/],
	  [/[[:xdigit:]]/, /\p{XDigit}/],
	  [/[[:word:]]/, /\p{Word}/],
	  [/[[:ascii:]]/, /\p{ASCII}/],
	  # Whole-repertoire properties
	  /\p{Any}/,
	  /\p{Assigned}/,
	  # Letters
	  /\p{L}/,
	  /\p{Ll}/,
	  /\p{Lm}/,
	  /\p{Lo}/,
	  /\p{Lt}/,
	  /\p{Lu}/,
	  # Marks
	  /\p{M}/,
	  /\p{Mn}/,
	  /\p{Mc}/,
	  /\p{Me}/,
	  # Numbers
	  /\p{N}/,
	  /\p{Nd}/,
	  /\p{Nl}/,
	  /\p{No}/,
	  # Punctuation
	  /\p{P}/,
	  /\p{Pc}/,
	  /\p{Pd}/,
	  /\p{Ps}/,
	  /\p{Pe}/,
	  /\p{Pi}/,
	  /\p{Pf}/,
	  /\p{Po}/,
	  # Symbols
	  /\p{S}/,
	  /\p{Sm}/,
	  /\p{Sc}/,
	  /\p{Sk}/,
	  /\p{So}/,
	  # Separators
	  /\p{Z}/,
	  /\p{Zs}/,
	  /\p{Zl}/,
	  /\p{Zp}/,
	  # Other
	  /\p{C}/,
	  /\p{Cc}/,
	  /\p{Cf}/,
	  /\p{Cn}/,
	  /\p{Co}/,
	  /\p{Cs}/,
	].freeze

	# Highest code point value that is visited (0x10ffff).
	CODEPOINTS = 0x10ffff

	# After the scan, min holds the lowest code point that could not be
	# encoded and max the highest one that could.
	min = CODEPOINTS
	max = 0

	# Concatenation, in ascending order, of every code point that converts to
	# a UTF-8 character. The range is inclusive so that CODEPOINTS itself is
	# visited; `CODEPOINTS.times` would stop one short of it.
	all_string = (0..CODEPOINTS).map do |n|
	  begin
	    char = n.chr('UTF-8')
	    max = n if n > max
	    char
	  rescue RangeError
	    # Integer#chr raises RangeError for values with no UTF-8 encoding;
	    # those are recorded and left out of the string.
	    min = n if n < min
	    nil
	  end
	end.compact.join

	# Summary on stderr: the limit, both bounds, and the length counted two ways.
	warn "codepoints #{CODEPOINTS}, min:#{min}, max:#{max}, #{all_string.codepoints.size}, #{all_string.size}"

	#REGEXPS.each do |re|
	# if Array===re
	# matches1 = all_string.scan(re.first)
	# matches2 = all_string.scan(re.last)
	# warn "matches #{re}, #{matches1 == matches2}, #{matches1.size}, #{matches2.size}"
	# else
	# matches = all_string.scan(re)
	# warn "re is #{re}, chars: #{matches.size}"
	# end
	#end

	# Properties whose individual matches are listed in full.
	LIST_RE = [
	  /\p{Blank}/,
	  /\p{Space}/,
	  /\p{XDigit}/,
	  /\p{Lt}/,
	  /\p{Me}/,
	]

	# For each property, write to stderr the number of matching characters in
	# all_string, followed by one line per match: its code point, its
	# inspected form, and the raw character in brackets.
	# Block parameters are delimited by plain pipes; a backslash before the
	# pipe does not parse.
	LIST_RE.each do |re|
	  matches = all_string.scan(re)
	  warn "chars for #{re.inspect}, #{matches.size}, \n#{matches.map { |c| "#{c.codepoints.first}, #{c.inspect}[#{c}]" } * "\n"}"
	end

	=begin
	codepoints 1114111, min:55296, max:1114110, 1112063, 1112063
	matches [/[[:alnum:]]/, /\p{Alnum}/], true, 102619, 102619
	matches [/[[:alpha:]]/, /\p{Alpha}/], true, 102159, 102159
	matches [/[[:blank:]]/, /\p{Blank}/], true, 19, 19
	matches [/[[:cntrl:]]/, /\p{Cntrl}/], true, 65, 65
	matches [/[[:digit:]]/, /\p{Digit}/], true, 460, 460
	matches [/[[:graph:]]/, /\p{Graph}/], true, 247564, 247564
	matches [/[[:lower:]]/, /\p{Lower}/], true, 1934, 1934
	matches [/[[:print:]]/, /\p{Print}/], true, 247582, 247582
	matches [/[[:punct:]]/, /\p{Punct}/], true, 632, 632
	matches [/[[:space:]]/, /\p{Space}/], true, 26, 26
	matches [/[[:upper:]]/, /\p{Upper}/], true, 1483, 1483
	matches [/[[:xdigit:]]/, /\p{XDigit}/], true, 22, 22
	matches [/[[:word:]]/, /\p{Word}/], true, 103404, 103404
	matches [/[[:ascii:]]/, /\p{ASCII}/], true, 128, 128
	re is (\p{Any}), chars: 1112063
	re is (\p{Assigned}), chars: 247649
	re is (\p{L}), chars: 101013
	re is (\p{Ll}), chars: 1751
	re is (\p{Lm}), chars: 237
	re is (\p{Lo}), chars: 97553
	re is (\p{Lt}), chars: 31
	re is (\p{Lu}), chars: 1441
	re is (\p{Lo}), chars: 97553
	re is (\p{M}), chars: 1645
	re is (\p{Mn}), chars: 1280
	re is (\p{Mc}), chars: 353
	re is (\p{Me}), chars: 12
	re is (\p{N}), chars: 1148
	re is (\p{Nd}), chars: 460
	re is (\p{Nl}), chars: 224
	re is (\p{No}), chars: 464
	re is (\p{P}), chars: 632
	re is (\p{Pc}), chars: 10
	re is (\p{Pd}), chars: 23
	re is (\p{Ps}), chars: 72
	re is (\p{Pe}), chars: 71
	re is (\p{Pi}), chars: 12
	re is (\p{Pf}), chars: 10
	re is (\p{Po}), chars: 434
	re is (\p{S}), chars: 5519
	re is (\p{Sm}), chars: 952
	re is (\p{Sc}), chars: 48
	re is (\p{Sc}), chars: 48
	re is (\p{Sk}), chars: 115
	re is (\p{So}), chars: 4404
	re is (\p{Z}), chars: 20
	re is (\p{Zs}), chars: 18
	re is (\p{Zl}), chars: 1
	re is (\p{Zp}), chars: 1
	re is (\p{C}), chars: 1002086
	re is (\p{Cc}), chars: 65
	re is (\p{Cf}), chars: 139
	re is (\p{Cn}), chars: 864414
	re is (\p{Co}), chars: 137468
	re is (\p{Cs}), chars: 0
	=end