Skip to content

Instantly share code, notes, and snippets.

@asakura
Created March 11, 2024 20:54
Show Gist options
  • Save asakura/0a61b18587bef406de1b3f591b9a6126 to your computer and use it in GitHub Desktop.
Save asakura/0a61b18587bef406de1b3f591b9a6126 to your computer and use it in GitHub Desktop.

Is UTF8 string valid?

Mix.install([:benchee, :benchee_markdown])

Section

defmodule BulkMatch32 do
  defguardp is_inner_byte(c) when c >= 128 and c < 192

  def valid?(str), do: do_valid?(str)

  defp do_valid?(<<a::32, rest::binary>>) when Bitwise.band(a, 0x80808080) == 0,
    do: do_valid?(rest)

  defp do_valid?(<<a::8, rest::binary>>) when a < 128, do: do_valid?(rest)

  defp do_valid?(<<a::8, b::8, rest::binary>>) when a >= 192 and a < 224 and is_inner_byte(b),
    do: do_valid?(rest)

  defp do_valid?(<<a::8, b::8, c::8, rest::binary>>)
       when a >= 224 and a < 240 and is_inner_byte(b) and is_inner_byte(c),
       do: do_valid?(rest)

  defp do_valid?(<<a::8, b::8, c::8, d::8, rest::binary>>)
       when a >= 240 and is_inner_byte(b) and is_inner_byte(c) and is_inner_byte(d),
       do: do_valid?(rest)

  defp do_valid?(<<>>), do: true
  defp do_valid?(_str), do: false
end
defmodule BulkMatchBandBor do
  def valid?(<<a::8, rest::binary>>) when a < 128,
    do: valid?(rest)

  def valid?(<<a::16, rest::binary>>)
      when Bitwise.band(a, 0b1100000010000000) == 49280 and
             Bitwise.bor(a, 0b1101111110111111) == 57279,
      do: valid?(rest)

  def valid?(<<a::24, rest::binary>>)
      when Bitwise.band(a, 0b111000001000000010000000) == 14_712_960 and
             Bitwise.bor(a, 0b111011111011111110111111) == 15_712_191,
      do: valid?(rest)

  def valid?(<<a::32, rest::binary>>) do
    cond do
      Bitwise.band(a, 0b11110000100000001000000010000000) == 4_034_953_344 and
          Bitwise.bor(a, 0b11110111101111111011111110111111) == 4_156_538_815 ->
        valid?(rest)

      Bitwise.band(a, 0x80808080) == 0 ->
        valid?(rest)

      :otherwise ->
        false
    end
  end

  def valid?(<<>>), do: true
  def valid?(_str), do: false
end

BulkMatchBandBor.valid?("a") == true and
  BulkMatchBandBor.valid?("ø") == true and
  BulkMatchBandBor.valid?(<<0xFFFF::16>>) == false and
  BulkMatchBandBor.valid?(<<0xEF, 0xB7, 0x90>>) == true and
  BulkMatchBandBor.valid?("asd" <> <<0xFFFF::16>>) == false and
  BulkMatchBandBor.valid?("$") and
  BulkMatchBandBor.valid?("£") and
  BulkMatchBandBor.valid?("€") and
  BulkMatchBandBor.valid?("𐍈")
defmodule BulkMatchBandBor2 do
  def valid?(<<a::8, rest::binary>>) do
    cond do
      a < 128 -> valid?(rest)
      a > 191 and a < 224 -> valid?(:two, rest)
      a > 223 and a < 240 -> valid?(:three, rest)
      a > 239 -> valid?(:four, rest)
    end
  end

  def valid?(<<>>), do: true
  def valid?(_str), do: false

  @compile {:inline, valid?: 2}

  def valid?(:two, <<a::8, rest::binary>>)
      when Bitwise.band(a, 0b10000000) == 128 and
             Bitwise.bor(a, 0b10111111) == 191,
      do: valid?(rest)

  def valid?(:three, <<a::16, rest::binary>>)
      when Bitwise.band(a, 0b1000000010000000) == 32896 and
             Bitwise.bor(a, 0b1011111110111111) == 49087,
      do: valid?(rest)

  def valid?(:four, <<a::24, rest::binary>>)
      when Bitwise.band(a, 0b100000001000000010000000) == 8_421_504 and
             Bitwise.bor(a, 0b101111111011111110111111) == 12_566_463,
      do: valid?(rest)

  def valid?(_, _), do: false
end

BulkMatchBandBor2.valid?("a") == true and
  BulkMatchBandBor2.valid?("ø") == true and
  BulkMatchBandBor2.valid?(<<0xFFFF::16>>) == false and
  BulkMatchBandBor2.valid?(<<0xEF, 0xB7, 0x90>>) == true and
  BulkMatchBandBor2.valid?("asd" <> <<0xFFFF::16>>) == false and
  BulkMatchBandBor2.valid?("$") and
  BulkMatchBandBor2.valid?("£") and
  BulkMatchBandBor2.valid?("€") and
  BulkMatchBandBor2.valid?("𐍈")
defmodule BulkMatchBandBor3 do
  def valid?(<<a::8, rest::binary>>) do
    cond do
      a < 128 ->
        valid_one?(rest)

      a > 191 and a < 224 ->
        valid_two?(rest)

      a > 223 and a < 240 ->
        valid_three?(rest)

      a > 239 ->
        valid_four?(rest)
    end
  end

  def valid?(<<>>), do: true
  def valid?(_str), do: false

  @compile {:inline, valid_two?: 1}

  def valid_two?(<<a::8, rest::binary>>)
      when a < 192,
      do: valid?(rest)

  def valid_two?(_),
    do: false

  @compile {:inline, valid_three?: 1}

  def valid_three?(<<a::16, rest::binary>>)
      when Bitwise.band(a, 0x8080) == 0x8080 and
             Bitwise.bor(a, 0xBFBF) == 0xBFBF,
      do: valid?(rest)

  def valid_three?(_),
    do: false

  @compile {:inline, valid_four?: 1}

  def valid_four?(<<a::24, rest::binary>>)
      when Bitwise.band(a, 0x808080) == 0x808080 and
             Bitwise.bor(a, 0xBFBFBF) == 0xBFBFBF,
      do: valid?(rest)

  def valid_four?(_),
    do: false

  @compile {:inline, valid_one?: 1}
  def valid_one?(<<a::56, rest::binary>>)
      when Bitwise.band(a, 0x80808080808080) == 0,
      do: valid?(rest)

  def valid_one?(rest),
    do: valid?(rest)
end

BulkMatchBandBor3.valid?("a") == true and
  BulkMatchBandBor3.valid?("ø") == true and
  BulkMatchBandBor3.valid?(<<0xFFFF::16>>) == false and
  BulkMatchBandBor3.valid?(<<0xEF, 0xB7, 0x90>>) == true and
  BulkMatchBandBor3.valid?("asd" <> <<0xFFFF::16>>) == false and
  BulkMatchBandBor3.valid?("$") and
  BulkMatchBandBor3.valid?("£") and
  BulkMatchBandBor3.valid?("€") and
  BulkMatchBandBor3.valid?("𐍈")
defmodule Cowboy do
  def valid?(str), do: validate_utf8(str, 0) == 0

  defp validate_utf8(<<>>, state), do: state
  defp validate_utf8(<<c::8, rest::bits>>, 0) when c < 128, do: validate_utf8(rest, 0)

  defp validate_utf8(<<c::8, rest::bits>>, 2) when c >= 128 and c < 144,
    do: validate_utf8(rest, 0)

  defp validate_utf8(<<c::8, rest::bits>>, 3) when c >= 128 and c < 144,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 5) when c >= 128 and c < 144,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 7) when c >= 128 and c < 144,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 8) when c >= 128 and c < 144,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 2) when c >= 144 and c < 160,
    do: validate_utf8(rest, 0)

  defp validate_utf8(<<c::8, rest::bits>>, 3) when c >= 144 and c < 160,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 5) when c >= 144 and c < 160,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 6) when c >= 144 and c < 160,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 7) when c >= 144 and c < 160,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 2) when c >= 160 and c < 192,
    do: validate_utf8(rest, 0)

  defp validate_utf8(<<c::8, rest::bits>>, 3) when c >= 160 and c < 192,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 4) when c >= 160 and c < 192,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 6) when c >= 160 and c < 192,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 7) when c >= 160 and c < 192,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 0) when c >= 194 and c < 224,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<224::8, rest::bits>>, 0), do: validate_utf8(rest, 4)

  defp validate_utf8(<<c::8, rest::bits>>, 0) when c >= 225 and c < 237,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<237::8, rest::bits>>, 0), do: validate_utf8(rest, 5)

  defp validate_utf8(<<c::8, rest::bits>>, 0) when c == 238 and c == 239,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<240::8, rest::bits>>, 0), do: validate_utf8(rest, 6)

  defp validate_utf8(<<c::8, rest::bits>>, 0) when c == 241 and c == 242 and c == 243,
    do: validate_utf8(rest, 7)

  defp validate_utf8(<<244::8, rest::bits>>, 0), do: validate_utf8(rest, 8)
  defp validate_utf8(_, _), do: 1
end
defmodule New.Macro do
  defmacro utf8_accept() do
    quote do
      0
    end
  end

  defmacro utf8_reject() do
    quote do
      12
    end
  end

  defguard are_all_ascii_plain(b1, b2, b3, b4, b5, b6, b7, b8)
           when b1 <= 127 and
                  b2 <= 127 and
                  b3 <= 127 and
                  b4 <= 127 and
                  b5 <= 127 and
                  b6 <= 127 and
                  b7 <= 127 and
                  b8 <= 127

  # This is an adapted table from "Flexible and Economical UTF-8 Decoding" by Bjoern Hoehrmann.
  # http://bjoern.hoehrmann.de/utf-8/decoder/dfa/

  # Map character to character class
  defmacro utf8t() do
    quote do
      {
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        8,
        8,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        10,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        4,
        3,
        3,
        11,
        6,
        6,
        6,
        5,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8
      }
    end
  end

  # Transition table mapping combination of state & class to next state
  defmacro utf8s() do
    quote do
      {
        12,
        24,
        36,
        60,
        96,
        84,
        12,
        12,
        12,
        48,
        72,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        0,
        12,
        12,
        12,
        12,
        12,
        0,
        12,
        0,
        12,
        12,
        12,
        24,
        12,
        12,
        12,
        12,
        12,
        24,
        12,
        24,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        24,
        12,
        12,
        12,
        12,
        12,
        24,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        24,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        36,
        12,
        36,
        12,
        12,
        12,
        36,
        12,
        12,
        12,
        12,
        12,
        36,
        12,
        36,
        12,
        12,
        12,
        36,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12
      }
    end
  end

  # Optimisation for 1st byte direct state lookup,
  # we know starting state is 0 and ASCII bytes were already handled
  defmacro utf8s0() do
    quote do
      {
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        48,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        60,
        36,
        36,
        72,
        84,
        84,
        84,
        96,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12
      }
    end
  end
end
defmodule New do
  import New.Macro

  def valid?(<<byte, rest::bits>>) when byte <= 127,
    do: valid?(rest)

  def valid?(<<byte, rest::bytes>>) do
    case elem(utf8s0(), byte - 127 - 1) do
      utf8_reject() -> false
      state -> valid_utf8?(rest, state)
    end
  end

  def valid?(<<>>), do: true

  def valid_utf8?(<<byte, rest::binary>>, state) do
    type = elem(utf8t(), byte)

    case elem(utf8s(), state + type - 1) do
      utf8_accept() -> valid_ascii?(rest)
      utf8_reject() -> false
      state -> valid_utf8?(rest, state)
    end
  end

  def valid_utf8?(<<>>, utf8_accept()), do: true

  def valid_utf8?(_, _), do: false

  def valid_ascii?(binary) do
    case binary do
      <<b1, b2, b3, b4, b5, b6, b7, b8, rest::binary>>
      when are_all_ascii_plain(b1, b2, b3, b4, b5, b6, b7, b8) ->
        valid_ascii?(rest)

      other ->
        valid?(other)
    end
  end
end
New.valid?("a") == true and
  New.valid?("ø") == true and
  New.valid?(<<0xFFFF::16>>) == false and
  New.valid?(<<0xEF, 0xB7, 0x90>>) == true and
  New.valid?("asd" <> <<0xFFFF::16>>) == false
Benchee.run(
  %{
    "Native" => &String.valid?/1,
    "Lookup" => &New.valid?/1,
    "cowboy" => &Cowboy.valid?/1,
    "AndOr" => &BulkMatchBandBor.valid?/1,
    "AndOr2" => &BulkMatchBandBor2.valid?/1,
    "AndOr3" => &BulkMatchBandBor3.valid?/1,
    "32" => &BulkMatch32.valid?/1
  },
  time: 2,
  memory_time: 2,
  inputs: %{
    "1byte" => String.duplicate("$", 10_000),
    "2byte" => String.duplicate("£", 10_000),
    "3byte" => String.duplicate("€", 10_000),
    "4byte" => String.duplicate("𐍈", 10_000),
    "mixed" => String.duplicate("$£€𐍈", 10_000),
    "mixed rev" => String.duplicate("𐍈€£$", 10_000),
    "less than 64 bytes" => String.duplicate("$", 63)
  }
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment