Skip to content

Instantly share code, notes, and snippets.

@asakura
Last active March 18, 2024 09:47
Show Gist options
  • Save asakura/71d8f754403dda46051163d6b7b5b741 to your computer and use it in GitHub Desktop.
Save asakura/71d8f754403dda46051163d6b7b5b741 to your computer and use it in GitHub Desktop.

Is UTF8 string valid?

Mix.install([:benchee, :benchee_markdown])

Section

defmodule BulkMatch32 do
  defguardp is_inner_byte(c) when c >= 128 and c < 192

  def valid?(str), do: do_valid?(str)

  defp do_valid?(<<a::32, rest::binary>>) when Bitwise.band(a, 0x80808080) == 0,
    do: do_valid?(rest)

  defp do_valid?(<<a::8, rest::binary>>) when a < 128, do: do_valid?(rest)

  defp do_valid?(<<a::8, b::8, rest::binary>>) when a >= 192 and a < 224 and is_inner_byte(b),
    do: do_valid?(rest)

  defp do_valid?(<<a::8, b::8, c::8, rest::binary>>)
       when a >= 224 and a < 240 and is_inner_byte(b) and is_inner_byte(c),
       do: do_valid?(rest)

  defp do_valid?(<<a::8, b::8, c::8, d::8, rest::binary>>)
       when a >= 240 and is_inner_byte(b) and is_inner_byte(c) and is_inner_byte(d),
       do: do_valid?(rest)

  defp do_valid?(<<>>), do: true
  defp do_valid?(_str), do: false
end
defmodule BulkMatchBandBor do
  def valid?(<<a::8, rest::binary>>) when a < 128,
    do: valid?(rest)

  def valid?(<<a::16, rest::binary>>)
      when Bitwise.band(a, 0b1100000010000000) == 49280 and
             Bitwise.bor(a, 0b1101111110111111) == 57279,
      do: valid?(rest)

  def valid?(<<a::24, rest::binary>>)
      when Bitwise.band(a, 0b111000001000000010000000) == 14_712_960 and
             Bitwise.bor(a, 0b111011111011111110111111) == 15_712_191,
      do: valid?(rest)

  def valid?(<<a::32, rest::binary>>) do
    cond do
      Bitwise.band(a, 0b11110000100000001000000010000000) == 4_034_953_344 and
          Bitwise.bor(a, 0b11110111101111111011111110111111) == 4_156_538_815 ->
        valid?(rest)

      Bitwise.band(a, 0x80808080) == 0 ->
        valid?(rest)

      :otherwise ->
        false
    end
  end

  def valid?(<<>>), do: true
  def valid?(_str), do: false
end

BulkMatchBandBor.valid?("a") == true and
  BulkMatchBandBor.valid?("ø") == true and
  BulkMatchBandBor.valid?(<<0xFFFF::16>>) == false and
  BulkMatchBandBor.valid?(<<0xEF, 0xB7, 0x90>>) == true and
  BulkMatchBandBor.valid?("asd" <> <<0xFFFF::16>>) == false and
  BulkMatchBandBor.valid?("$") and
  BulkMatchBandBor.valid?("£") and
  BulkMatchBandBor.valid?("€") and
  BulkMatchBandBor.valid?("𐍈")
defmodule BulkMatchBandBor2 do
  def valid?(<<a::8, rest::binary>>) do
    cond do
      a < 128 -> valid?(rest)
      a > 191 and a < 224 -> valid?(:two, rest)
      a > 223 and a < 240 -> valid?(:three, rest)
      a > 239 -> valid?(:four, rest)
    end
  end

  def valid?(<<>>), do: true
  def valid?(_str), do: false

  @compile {:inline, valid?: 2}

  def valid?(:two, <<a::8, rest::binary>>)
      when Bitwise.band(a, 0b10000000) == 128 and
             Bitwise.bor(a, 0b10111111) == 191,
      do: valid?(rest)

  def valid?(:three, <<a::16, rest::binary>>)
      when Bitwise.band(a, 0b1000000010000000) == 32896 and
             Bitwise.bor(a, 0b1011111110111111) == 49087,
      do: valid?(rest)

  def valid?(:four, <<a::24, rest::binary>>)
      when Bitwise.band(a, 0b100000001000000010000000) == 8_421_504 and
             Bitwise.bor(a, 0b101111111011111110111111) == 12_566_463,
      do: valid?(rest)

  def valid?(_, _), do: false
end

BulkMatchBandBor2.valid?("a") == true and
  BulkMatchBandBor2.valid?("ø") == true and
  BulkMatchBandBor2.valid?(<<0xFFFF::16>>) == false and
  BulkMatchBandBor2.valid?(<<0xEF, 0xB7, 0x90>>) == true and
  BulkMatchBandBor2.valid?("asd" <> <<0xFFFF::16>>) == false and
  BulkMatchBandBor2.valid?("$") and
  BulkMatchBandBor2.valid?("£") and
  BulkMatchBandBor2.valid?("€") and
  BulkMatchBandBor2.valid?("𐍈")
defmodule BulkMatchBandBor3 do
  def valid?(<<a::8, rest::binary>>) do
    cond do
      a < 128 ->
        valid_one?(rest)

      a > 191 and a < 224 ->
        valid_two?(rest)

      a > 223 and a < 240 ->
        valid_three?(rest)

      a > 239 ->
        valid_four?(rest)
    end
  end

  def valid?(<<>>), do: true
  def valid?(_str), do: false

  @compile {:inline, valid_two?: 1}

  def valid_two?(<<a::8, rest::binary>>)
      when a < 192,
      do: valid?(rest)

  def valid_two?(_),
    do: false

  @compile {:inline, valid_three?: 1}

  def valid_three?(<<a::16, rest::binary>>)
      when Bitwise.band(a, 0x8080) == 0x8080 and
             Bitwise.bor(a, 0xBFBF) == 0xBFBF,
      do: valid?(rest)

  def valid_three?(_),
    do: false

  @compile {:inline, valid_four?: 1}

  def valid_four?(<<a::24, rest::binary>>)
      when Bitwise.band(a, 0x808080) == 0x808080 and
             Bitwise.bor(a, 0xBFBFBF) == 0xBFBFBF,
      do: valid?(rest)

  def valid_four?(_),
    do: false

  @compile {:inline, valid_one?: 1}
  def valid_one?(<<a::56, rest::binary>>)
      when Bitwise.band(a, 0x80808080808080) == 0,
      do: valid?(rest)

  def valid_one?(rest),
    do: valid?(rest)
end

BulkMatchBandBor3.valid?("a") == true and
  BulkMatchBandBor3.valid?("ø") == true and
  BulkMatchBandBor3.valid?(<<0xFFFF::16>>) == false and
  BulkMatchBandBor3.valid?(<<0xEF, 0xB7, 0x90>>) == true and
  BulkMatchBandBor3.valid?("asd" <> <<0xFFFF::16>>) == false and
  BulkMatchBandBor3.valid?("$") and
  BulkMatchBandBor3.valid?("£") and
  BulkMatchBandBor3.valid?("€") and
  BulkMatchBandBor3.valid?("𐍈")
defmodule AndOr4 do
  def valid?(<<>>), do: true
  def valid?(str) when not is_binary(str), do: false
  def valid?(str), do: do_valid?(str)

  def do_valid?(<<a::56, rest::binary>>)
      when Bitwise.band(a, 0x80808080808080) == 0,
      do: do_valid?(rest)

  def do_valid?(<<a::8, rest::binary>>) do
    rest =
      cond do
        a <= 127 ->
          rest

        a >= 194 and a <= 223 ->
          case rest do
            <<a::8, rest::binary>> when a < 192 -> rest
            _ -> false
          end

        a >= 224 and a <= 239 ->
          case rest do
            <<b::16, rest::binary>>
            when Bitwise.band(b, 0x8080) == 0x8080 and
                   Bitwise.bor(b, 0xBFBF) == 0xBFBF ->
              cond do
                a == 224 ->
                  c = Bitwise.bsr(a, 8)

                  cond do
                    c >= 128 and c <= 159 -> false
                    :otherwise -> rest
                  end

                a == 237 ->
                  c = Bitwise.bsr(a, 8)

                  cond do
                    c >= 160 and c <= 191 -> false
                    :otherwise -> rest
                  end

                (a >= 225 and a <= 236) or a == 238 or a == 239 ->
                  rest

                :otherwise ->
                  false
              end

            _ ->
              false
          end

        a >= 240 ->
          case rest do
            <<b::24, rest::binary>>
            when Bitwise.band(b, 0x808080) == 0x808080 and
                   Bitwise.bor(b, 0xBFBFBF) == 0xBFBFBF ->
              cond do
                a == 240 ->
                  c = Bitwise.bsr(b, 16)

                  cond do
                    c >= 128 and c <= 143 -> false
                    :otherwise -> rest
                  end

                a >= 241 and a <= 243 ->
                  rest

                a == 244 ->
                  c = Bitwise.bsr(b, 16)

                  cond do
                    c >= 160 and c <= 191 -> false
                    :otherwise -> rest
                  end

                :otherwise ->
                  false
              end

            _ ->
              false
          end

        :otherwise ->
          false
      end

    do_valid?(rest)
  end

  def do_valid?(<<>>), do: true
  def do_valid?(false), do: false
end

AndOr4.valid?("a") == true and
  AndOr4.valid?("ø") == true and
  AndOr4.valid?(<<0xFFFF::16>>) == false and
  AndOr4.valid?(<<0xEF, 0xB7, 0x90>>) == true and
  AndOr4.valid?("asd" <> <<0xFFFF::16>>) == false and
  AndOr4.valid?("$") and
  AndOr4.valid?("£") and
  AndOr4.valid?("€") and
  AndOr4.valid?("𐍈")

# AndOr4.valid?(<<0b11100000, 0b10000000, 0b10000000>>)
defmodule Cowboy do
  def valid?(str), do: validate_utf8(str, 0) == 0

  defp validate_utf8(<<>>, state), do: state
  defp validate_utf8(<<c::8, rest::bits>>, 0) when c < 128, do: validate_utf8(rest, 0)

  defp validate_utf8(<<c::8, rest::bits>>, 2) when c >= 128 and c < 144,
    do: validate_utf8(rest, 0)

  defp validate_utf8(<<c::8, rest::bits>>, 3) when c >= 128 and c < 144,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 5) when c >= 128 and c < 144,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 7) when c >= 128 and c < 144,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 8) when c >= 128 and c < 144,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 2) when c >= 144 and c < 160,
    do: validate_utf8(rest, 0)

  defp validate_utf8(<<c::8, rest::bits>>, 3) when c >= 144 and c < 160,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 5) when c >= 144 and c < 160,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 6) when c >= 144 and c < 160,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 7) when c >= 144 and c < 160,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 2) when c >= 160 and c < 192,
    do: validate_utf8(rest, 0)

  defp validate_utf8(<<c::8, rest::bits>>, 3) when c >= 160 and c < 192,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 4) when c >= 160 and c < 192,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<c::8, rest::bits>>, 6) when c >= 160 and c < 192,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 7) when c >= 160 and c < 192,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<c::8, rest::bits>>, 0) when c >= 194 and c < 224,
    do: validate_utf8(rest, 2)

  defp validate_utf8(<<224::8, rest::bits>>, 0), do: validate_utf8(rest, 4)

  defp validate_utf8(<<c::8, rest::bits>>, 0) when c >= 225 and c < 237,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<237::8, rest::bits>>, 0), do: validate_utf8(rest, 5)

  defp validate_utf8(<<c::8, rest::bits>>, 0) when c == 238 and c == 239,
    do: validate_utf8(rest, 3)

  defp validate_utf8(<<240::8, rest::bits>>, 0), do: validate_utf8(rest, 6)

  defp validate_utf8(<<c::8, rest::bits>>, 0) when c == 241 and c == 242 and c == 243,
    do: validate_utf8(rest, 7)

  defp validate_utf8(<<244::8, rest::bits>>, 0), do: validate_utf8(rest, 8)
  defp validate_utf8(_, _), do: 1
end
defmodule New.Macro do
  defmacro utf8_accept() do
    quote do
      0
    end
  end

  defmacro utf8_reject() do
    quote do
      12
    end
  end

  defguard are_all_ascii_plain(b1, b2, b3, b4, b5, b6, b7, b8)
           when b1 <= 127 and
                  b2 <= 127 and
                  b3 <= 127 and
                  b4 <= 127 and
                  b5 <= 127 and
                  b6 <= 127 and
                  b7 <= 127 and
                  b8 <= 127

  # This is an adapted table from "Flexible and Economical UTF-8 Decoding" by Bjoern Hoehrmann.
  # http://bjoern.hoehrmann.de/utf-8/decoder/dfa/

  # Map character to character class
  defmacro utf8t() do
    quote do
      {
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        9,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        7,
        8,
        8,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        10,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        3,
        4,
        3,
        3,
        11,
        6,
        6,
        6,
        5,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8,
        8
      }
    end
  end

  # Transition table mapping combination of state & class to next state
  defmacro utf8s() do
    quote do
      {
        12,
        24,
        36,
        60,
        96,
        84,
        12,
        12,
        12,
        48,
        72,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        0,
        12,
        12,
        12,
        12,
        12,
        0,
        12,
        0,
        12,
        12,
        12,
        24,
        12,
        12,
        12,
        12,
        12,
        24,
        12,
        24,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        24,
        12,
        12,
        12,
        12,
        12,
        24,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        24,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        36,
        12,
        36,
        12,
        12,
        12,
        36,
        12,
        12,
        12,
        12,
        12,
        36,
        12,
        36,
        12,
        12,
        12,
        36,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12
      }
    end
  end

  # Optimisation for 1st byte direct state lookup,
  # we know starting state is 0 and ASCII bytes were already handled
  defmacro utf8s0() do
    quote do
      {
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        24,
        48,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        36,
        60,
        36,
        36,
        72,
        84,
        84,
        84,
        96,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12,
        12
      }
    end
  end
end
defmodule New do
  import New.Macro

  def valid?(<<byte, rest::bits>>) when byte <= 127,
    do: valid?(rest)

  def valid?(<<byte, rest::bytes>>) do
    case elem(utf8s0(), byte - 127 - 1) do
      utf8_reject() -> false
      state -> valid_utf8?(rest, state)
    end
  end

  def valid?(<<>>), do: true

  def valid_utf8?(<<byte, rest::binary>>, state) do
    type = elem(utf8t(), byte)

    case elem(utf8s(), state + type - 1) do
      utf8_accept() -> valid_ascii?(rest)
      utf8_reject() -> false
      state -> valid_utf8?(rest, state)
    end
  end

  def valid_utf8?(<<>>, utf8_accept()), do: true

  def valid_utf8?(_, _), do: false

  def valid_ascii?(binary) do
    case binary do
      <<b1, b2, b3, b4, b5, b6, b7, b8, rest::binary>>
      when are_all_ascii_plain(b1, b2, b3, b4, b5, b6, b7, b8) ->
        valid_ascii?(rest)

      other ->
        valid?(other)
    end
  end
end
New.valid?("a") == true and
  New.valid?("ø") == true and
  New.valid?(<<0xFFFF::16>>) == false and
  New.valid?(<<0xEF, 0xB7, 0x90>>) == true and
  New.valid?("asd" <> <<0xFFFF::16>>) == false
defmodule Benchmark do
  def native(str), do: String.valid?(str)
  def native_fast_ascii(str), do: String.valid?(str, :fast_ascii)
  def lookup(str), do: New.valid?(str)
  def cowboy(str), do: Cowboy.valid?(str)
  def and_or(str), do: BulkMatchBandBor.valid?(str)
  def and_or2(str), do: BulkMatchBandBor2.valid?(str)
  def and_or3(str), do: BulkMatchBandBor3.valid?(str)
  def and_or4(str), do: AndOr4.valid?(str)
  def bulk_32(str), do: BulkMatch32.valid?(str)

  def run() do
    Benchee.run(
      %{
        # "Native" => &native/1,
        "Native :fast_ascii" => &native_fast_ascii/1,
        "Lookup" => &lookup/1,
        "Cowboy" => &cowboy/1,
        # "AndOr" => &and_or/1,
        # "AndOr2" => &and_or2/1,
        "AndOr3" => &and_or3/1,
        "AndOr4" => &and_or4/1
        # "Bulk32" => &bulk_32/1
      },
      time: 2,
      # memory_time: 2,
      inputs: %{
        "1byte" => String.duplicate("$", 10_000),
        # "2byte" => String.duplicate("£", 10_000),
        # "3byte" => String.duplicate("€", 10_000),
        # "4byte" => String.duplicate("𐍈", 10_000),
        "mixed" => String.duplicate("$£€𐍈", 10_000),
        # "mixed rev" => String.duplicate("𐍈€£$", 10_000),
        "less than 64 bytes" => String.duplicate("$", 63)
      }
    )
  end
end

Benchmark.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment