Skip to content

Instantly share code, notes, and snippets.

@koshigoe
Last active February 7, 2019 03:29
Show Gist options
  • Save koshigoe/9efdda79b37c0ab8dae843aa38c18b35 to your computer and use it in GitHub Desktop.
Save koshigoe/9efdda79b37c0ab8dae843aa38c18b35 to your computer and use it in GitHub Desktop.
サロゲートペアなどを途中で切り詰めない様にしたい
require 'benchmark'

# Benchmarks several ways of measuring the length of a string, both by code
# point (String#size) and by grapheme cluster (the remaining reports).
#
# Each approach is measured twice:
#   (1) against `str`, whose kana are written directly in the literal, and
#   (2) against `gra`, where every kana is followed by U+3099 (combining
#       voiced sound mark), so one visible character spans two code points.
#
# NOTE(review): ActiveSupport is referenced below but never required in this
# script; presumably it is run in an environment that already loads it
# (e.g. a Rails console/runner) - confirm.
N = 10
str = "がぎぐげご" * 100
gra = "か\u3099き\u3099く\u3099け\u3099こ\u3099" * 100

Benchmark.bm(64) do |x|
  # Baseline: code point count, which is not grapheme-aware.
  # The report blocks take no parameter: Benchmark yields nothing to them, and
  # naming one `x` would only shadow the outer report object.
  x.report('String#size (1)') do
    N.times { str.size }
  end
  x.report('String#size (2)') do
    N.times { gra.size }
  end

  x.report('ActiveSupport::Multibyte::Unicode.unpack_graphemes (1)') do
    N.times { ActiveSupport::Multibyte::Unicode.unpack_graphemes(str) }
  end
  x.report('ActiveSupport::Multibyte::Unicode.unpack_graphemes (2)') do
    N.times { ActiveSupport::Multibyte::Unicode.unpack_graphemes(gra) }
  end

  # \X matches one extended grapheme cluster.
  x.report('String#scan.size (1)') do
    N.times { str.scan(/\X/).size }
  end
  x.report('String#scan.size (2)') do
    N.times { gra.scan(/\X/).size }
  end

  x.report('String#grapheme_clusters.size (1)') do
    N.times { str.grapheme_clusters.size }
  end
  x.report('String#grapheme_clusters.size (2)') do
    N.times { gra.grapheme_clusters.size }
  end

  x.report('String#each_grapheme_cluster.count (1)') do
    N.times { str.each_grapheme_cluster.count }
  end
  x.report('String#each_grapheme_cluster.count (2)') do
    N.times { gra.each_grapheme_cluster.count }
  end

  x.report('String#each_grapheme_cluster.size (1)') do
    N.times { str.each_grapheme_cluster.size }
  end
  x.report('String#each_grapheme_cluster.size (2)') do
    N.times { gra.each_grapheme_cluster.size }
  end
end
__END__
user system total real
String#size (1) 0.000018 0.000008 0.000026 ( 0.000022)
String#size (2) 0.000012 0.000002 0.000014 ( 0.000012)
ActiveSupport::Multibyte::Unicode.unpack_graphemes (1) 2.384050 0.043771 2.427821 ( 2.440785)
ActiveSupport::Multibyte::Unicode.unpack_graphemes (2) 4.380538 0.015965 4.396503 ( 4.457205)
String#scan.size (1) 0.005429 0.000144 0.005573 ( 0.005697)
String#scan.size (2) 0.006267 0.000076 0.006343 ( 0.006353)
String#grapheme_clusters.size (1) 0.004079 0.000104 0.004183 ( 0.004222)
String#grapheme_clusters.size (2) 0.003742 0.000058 0.003800 ( 0.003826)
String#each_grapheme_cluster.count (1) 0.004119 0.000023 0.004142 ( 0.004185)
String#each_grapheme_cluster.count (2) 0.003672 0.000006 0.003678 ( 0.003688)
String#each_grapheme_cluster.size (1) 0.003329 0.000010 0.003339 ( 0.003425)
String#each_grapheme_cluster.size (2) 0.003561 0.000017 0.003578 ( 0.003624)
# Benchmarks String#truncate against several grapheme-cluster-aware truncation
# variants, plus a per-code-point vs per-grapheme gsub for reference.
#
# NOTE(review): none of the truncation methods called here (`truncate`,
# `truncate_graphemes`, `truncate_graphemes_with_*`) are defined in this
# script; presumably ActiveSupport and the String extensions under test are
# loaded beforehand - confirm.
require 'benchmark'
# Number of iterations per report.
N = 1_000
# Kana written directly in the literal, repeated 100 times.
str = "がぎぐげご" * 100
# Same kana, but each one is followed by U+3099 (combining voiced sound mark),
# so every visible character is made of two code points.
gra = "か\u3099き\u3099く\u3099け\u3099こ\u3099" * 100
Benchmark.bm(80) do |x|
# Variants (1)-(4) of every group use the same four argument sets:
#   (1) str, ASCII omission
#   (2) str, ASCII omission + separator
#   (3) gra, two-code-point omission (U+2702 U+FE0F)
#   (4) gra, two-code-point omission + two-code-point separator
x.report('String#truncate (1)') do
N.times { str.truncate(100, omission: "...") }
end
x.report('String#truncate (2)') do
N.times { str.truncate(100, omission: "...", separator: "ぐ") }
end
x.report('String#truncate (3)') do
N.times { gra.truncate(100, omission: "\u{2702 fe0f}") }
end
x.report('String#truncate (4)') do
N.times { gra.truncate(100, omission: "\u{2702 fe0f}", separator: "く\u3099") }
end
# NOTE(review): the label says "..._with_each_grapheme_cluster_count" but the
# method invoked is plain `truncate_graphemes`; presumably that was its
# implementation when the numbers were recorded - confirm.
x.report('String#truncate_graphemes_with_each_grapheme_cluster_count (1)') do
N.times { str.truncate_graphemes(100, omission: "...") }
end
x.report('String#truncate_graphemes_with_each_grapheme_cluster_count (2)') do
N.times { str.truncate_graphemes(100, omission: "...", separator: "ぐ") }
end
x.report('String#truncate_graphemes_with_each_grapheme_cluster_count (3)') do
N.times { gra.truncate_graphemes(100, omission: "\u{2702 fe0f}") }
end
x.report('String#truncate_graphemes_with_each_grapheme_cluster_count (4)') do
N.times { gra.truncate_graphemes(100, omission: "\u{2702 fe0f}", separator: "く\u3099") }
end
x.report('String#truncate_graphemes_with_each_grapheme_cluster_size (1)') do
N.times { str.truncate_graphemes_with_each_grapheme_cluster_size(100, omission: "...") }
end
x.report('String#truncate_graphemes_with_each_grapheme_cluster_size (2)') do
N.times { str.truncate_graphemes_with_each_grapheme_cluster_size(100, omission: "...", separator: "ぐ") }
end
x.report('String#truncate_graphemes_with_each_grapheme_cluster_size (3)') do
N.times { gra.truncate_graphemes_with_each_grapheme_cluster_size(100, omission: "\u{2702 fe0f}") }
end
x.report('String#truncate_graphemes_with_each_grapheme_cluster_size (4)') do
N.times { gra.truncate_graphemes_with_each_grapheme_cluster_size(100, omission: "\u{2702 fe0f}", separator: "く\u3099") }
end
x.report('String#truncate_graphemes_with_grapheme_clusters_size (1)') do
N.times { str.truncate_graphemes_with_grapheme_clusters_size(100, omission: "...") }
end
x.report('String#truncate_graphemes_with_grapheme_clusters_size (2)') do
N.times { str.truncate_graphemes_with_grapheme_clusters_size(100, omission: "...", separator: "ぐ") }
end
x.report('String#truncate_graphemes_with_grapheme_clusters_size (3)') do
N.times { gra.truncate_graphemes_with_grapheme_clusters_size(100, omission: "\u{2702 fe0f}") }
end
x.report('String#truncate_graphemes_with_grapheme_clusters_size (4)') do
N.times { gra.truncate_graphemes_with_grapheme_clusters_size(100, omission: "\u{2702 fe0f}", separator: "く\u3099") }
end
# Reference cost of walking the string with a regexp: `.` matches per code
# point, `\X` matches per extended grapheme cluster.
x.report('String#gsub (1)') do
N.times { gra.gsub(/./, '.') }
end
x.report('String#gsub (2)') do
N.times { gra.gsub(/\X/, '.') }
end
end
__END__
user system total real
String#truncate (1) 0.001988 0.000649 0.002637 ( 0.002635)
String#truncate (2) 0.003430 0.000627 0.004057 ( 0.004072)
String#truncate (3) 0.002081 0.000568 0.002649 ( 0.002652)
String#truncate (4) 0.004323 0.000713 0.005036 ( 0.005125)
String#truncate_graphemes_with_each_grapheme_cluster_count (1) 0.496839 0.035412 0.532251 ( 0.544658)
String#truncate_graphemes_with_each_grapheme_cluster_count (2) 0.546470 0.007729 0.554199 ( 0.568006)
String#truncate_graphemes_with_each_grapheme_cluster_count (3) 0.431042 0.001793 0.432835 ( 0.433713)
String#truncate_graphemes_with_each_grapheme_cluster_count (4) 0.422881 0.000698 0.423579 ( 0.424463)
String#truncate_graphemes_with_each_grapheme_cluster_size (1) 0.366217 0.004378 0.370595 ( 0.371266)
String#truncate_graphemes_with_each_grapheme_cluster_size (2) 0.527039 0.013160 0.540199 ( 0.550277)
String#truncate_graphemes_with_each_grapheme_cluster_size (3) 0.402696 0.010364 0.413060 ( 0.418723)
String#truncate_graphemes_with_each_grapheme_cluster_size (4) 0.390556 0.010561 0.401117 ( 0.401974)
String#truncate_graphemes_with_grapheme_clusters_size (1) 0.420647 0.014305 0.434952 ( 0.435687)
String#truncate_graphemes_with_grapheme_clusters_size (2) 0.527312 0.010127 0.537439 ( 0.544975)
String#truncate_graphemes_with_grapheme_clusters_size (3) 0.478933 0.006017 0.484950 ( 0.488538)
String#truncate_graphemes_with_grapheme_clusters_size (4) 0.426145 0.001380 0.427525 ( 0.428163)
String#gsub (1) 0.149356 0.000599 0.149955 ( 0.150370)
String#gsub (2) 0.290007 0.001179 0.291186 ( 0.291967)
class String
  # Truncates the receiver to the given length, measured in grapheme clusters
  # rather than code points, so that a multi-code-point character (for example
  # a base letter plus a combining mark) is never cut in half.
  #
  # The omission is counted in grapheme clusters as well and is included in
  # the length budget. When the budget is smaller than the omission itself,
  # only the omission is returned.
  #
  # @param truncate_at [Integer] maximum length of the result, in grapheme clusters
  # @param options [Hash]
  # @option options [String] :omission text appended to a truncated result (default '...')
  # @option options [String, Regexp] :separator when given and found in the
  #   truncated text, the text is cut just before its last occurrence
  # @return [String] a copy of the receiver when it already fits, otherwise
  #   the truncated text followed by the omission
  #
  # @see https://github.com/rails/rails/blob/bfd296dda797e597e8a54709d1cd331cdffaa9f7/activesupport/lib/active_support/core_ext/string/filters.rb#L48-L79
  # @see https://github.com/rails/rails/blob/9cc463ed7b7be098602b72a98f72220ea6466ba2/activesupport/lib/active_support/core_ext/string/filters.rb#L81-L120
  #
  def truncate_graphemes(truncate_at, options = {})
    return dup if each_grapheme_cluster.size <= truncate_at

    omission = options[:omission] || '...'

    # Clamp at zero: a negative budget means not even the omission fits, in
    # which case nothing of the original text is kept.
    room = [truncate_at - omission.each_grapheme_cluster.size, 0].max

    # Take whole clusters from the front; the enumerator stops after `room`
    # clusters, and no per-call Regexp has to be built.
    truncated = each_grapheme_cluster.first(room).join

    separator = options[:separator]
    stop = separator && truncated.rindex(separator)
    truncated = truncated[0, stop] if stop

    "#{truncated}#{omission}"
  end
end
RSpec.describe String do
  describe '#truncate_graphemes' do
    # "a" followed by U+0300 (combining grave accent): two code points, shown as à.
    let(:a_grave) { "\u{0061 0300}" }
    # "e" followed by U+0300 (combining grave accent): two code points, shown as è.
    let(:e_grave) { "\u{0065 0300}" }
    # ààààà
    let(:five_a) { a_grave * 5 }
    # àààèàààèààà
    let(:a_runs_joined_by_e) { Array.new(3, a_grave * 3).join(e_grave) }

    context 'omission 未指定(default)' do
      context '切り詰め位置が切り詰め対象文字列の長さ(書記素クラスタ単位)未満' do
        context '切り詰め位置が omission (...) の長さ(書記素クラスタ単位)以下' do
          it 'omission だけを返す' do
            [0, 3].each do |limit|
              expect(five_a.truncate_graphemes(limit)).to eq '...'
            end
          end
        end

        context '切り詰め位置が omission (...) の長さ(書記素クラスタ単位)より大きい' do
          it '書記素クラスタ単位で切り詰めた結果に omission を付与する' do
            expect(five_a.truncate_graphemes(4)).to eq "#{a_grave}..."
          end
        end
      end

      context '切り詰め位置が切り詰め対象文字列の長さ(書記素クラスタ単位)以上' do
        it '切り詰めない' do
          [5, 6].each do |limit|
            expect(five_a.truncate_graphemes(limit)).to eq five_a
          end
        end
      end
    end

    context 'omission にサロゲートペアを含む' do
      it 'omission の長さも書記素クラスタ単位で数えた上で切り詰めを行う' do
        result = five_a.truncate_graphemes(3, omission: e_grave)

        expect(result).to eq "#{a_grave * 2}#{e_grave}"
      end
    end

    context 'separator 指定あり(サロゲートペアを含む)' do
      context '切り詰め対象文字列が separator を含む' do
        context '切り詰め位置が separator である' do
          it 'その位置で切り詰める' do
            result = a_runs_joined_by_e.truncate_graphemes(8, separator: e_grave, omission: '')

            expect(result).to eq "#{a_grave * 3}#{e_grave}#{a_grave * 3}"
          end
        end

        context '切り詰め位置が separator でない' do
          it '一つ前の separator 直前で切り詰める' do
            result = a_runs_joined_by_e.truncate_graphemes(7, separator: e_grave, omission: '')

            expect(result).to eq a_grave * 3
          end
        end
      end

      context '切り詰め対象文字が separator を含まない' do
        it '切り詰める' do
          result = five_a.truncate_graphemes(3, separator: e_grave, omission: '')

          expect(result).to eq a_grave * 3
        end
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment