TakahikoKawasaki/gist:8159741

## gistfile1.html
<!DOCTYPE html>
<html lang="ja">
<!--
 * Copyright (C) 2013 Neo Visionaries Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
-->
<head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>
<meta name="author" content="Takahiko Kawasaki">
<title>JavaScript で文字数、UTF-8 でのバイト数、サロゲートペアの数を数える</title>
<script type="text/javascript">
function compute_bytes_in_utf8(codePoint)
{
  // Returns the number of UTF-8 bytes contributed by one UTF-16 code unit.
  //
  // For code points up to U+FFFF, the code point and its UTF-16 code unit
  // have the same value, so the value can be classified by range directly.
  // Each entry is [inclusive upper bound, byte count]; the first entry
  // whose bound is not exceeded wins.
  var ranges = [
    [0x007F, 1],  // U+0000 - U+007F: 1 byte in UTF-8
    [0x07FF, 2],  // U+0080 - U+07FF: 2 bytes in UTF-8
    [0xD7FF, 3],  // U+0800 - U+D7FF: 3 bytes in UTF-8
    // 0xD800 - 0xDBFF are high surrogates, 0xDC00 - 0xDFFF low surrogates.
    // A surrogate pair represents U+10000 - U+10FFFF, which takes 4 bytes
    // in UTF-8. The pair consists of two code units, so each surrogate is
    // charged half of that (4 / 2 = 2).
    [0xDFFF, 2],
    [0xFFFF, 3]   // U+E000 - U+FFFF: 3 bytes in UTF-8
  ];

  for (var k = 0; k < ranges.length; ++k)
  {
    if (codePoint <= ranges[k][0])
    {
      return ranges[k][1];
    }
  }

  // U+10000 and above: never reached for UTF-16 code units.
  return 0;
}

function count_up()
{
  // Recomputes the three counters for the current text of the "input"
  // element and writes them into the output elements.

  // The input text and the HTML elements that display each result.
  var input         = document.getElementById("input").value;
  var outputLetters = document.getElementById("outputLetters");
  var outputBytes   = document.getElementById("outputBytes");
  var outputPairs   = document.getElementById("outputPairs");

  // Counters: characters, bytes in UTF-8, and surrogate pairs.
  var nLetters = 0;
  var nBytes   = 0;
  var nPairs   = 0;

  // Walk the string one UTF-16 code unit at a time.
  for (var i = 0; i < input.length; ++i)
  {
    // charCodeAt() always returns a value smaller than 65,536. Larger
    // code points (U+10000 and above) appear as a high surrogate
    // immediately followed by a low surrogate.
    var codeUnit = input.charCodeAt(i);

    var isHighSurrogate = (0xD800 <= codeUnit && codeUnit <= 0xDBFF);
    var isLowSurrogate  = (0xDC00 <= codeUnit && codeUnit <= 0xDFFF);

    if (isHighSurrogate && i + 1 < input.length)
    {
      var nextUnit = input.charCodeAt(i + 1);

      if (0xDC00 <= nextUnit && nextUnit <= 0xDFFF)
      {
        // Well-formed surrogate pair: one character in the range
        // U+10000 - U+10FFFF, which takes 4 bytes in UTF-8.
        ++nLetters;
        ++nPairs;
        nBytes += 4;

        // The low surrogate has been consumed together with the high one.
        ++i;
        continue;
      }
    }

    // Anything else counts as one character.
    ++nLetters;

    if (isHighSurrogate || isLowSurrogate)
    {
      // Unpaired surrogate: it is not part of a pair, so it must not be
      // counted as one. A UTF-8 encoder such as TextEncoder replaces it
      // with U+FFFD, which takes 3 bytes.
      nBytes += 3;
    }
    else
    {
      // Ordinary BMP character: 1 to 3 bytes depending on its range.
      nBytes += compute_bytes_in_utf8(codeUnit);
    }
  }

  // Write the results. The values are plain numbers, so textContent is
  // sufficient and avoids HTML parsing.
  outputLetters.textContent = nLetters;
  outputBytes.textContent   = nBytes;
  outputPairs.textContent   = nPairs;
}
</script>
</head>
<body>
  <!-- Input: the counters are recomputed on every edit -->
  <input id="input" type="text" oninput="count_up()"><br><br>

  <!-- Output: number of characters -->
  文字数： <span id="outputLetters">0</span><br>

  <!-- Output: number of bytes in UTF-8 -->
  UTF-8 でのバイト数： <span id="outputBytes">0</span><br>

  <!-- Output: number of surrogate pairs -->
  サロゲートペアの数： <span id="outputPairs">0</span><br>
</body>
</html>
	<!DOCTYPE html>
	<html lang="ja">
	<!--
	* Copyright (C) 2013 Neo Visionaries Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	-->
	<head>
	<meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>
	<meta name="author" content="Takahiko Kawasaki">
	<title>JavaScript で文字数、UTF-8 でのバイト数、サロゲートペアの数を数える</title>
	<script type="text/javascript">
	function compute_bytes_in_utf8(codePoint)
	{
	  // Returns the number of UTF-8 bytes contributed by one UTF-16 code unit.
	  //
	  // For code points up to U+FFFF, the code point and its UTF-16 code unit
	  // have the same value, so the value can be classified by range directly.
	  // Each entry is [inclusive upper bound, byte count]; the first entry
	  // whose bound is not exceeded wins.
	  var ranges = [
	    [0x007F, 1],  // U+0000 - U+007F: 1 byte in UTF-8
	    [0x07FF, 2],  // U+0080 - U+07FF: 2 bytes in UTF-8
	    [0xD7FF, 3],  // U+0800 - U+D7FF: 3 bytes in UTF-8
	    // 0xD800 - 0xDBFF are high surrogates, 0xDC00 - 0xDFFF low surrogates.
	    // A surrogate pair represents U+10000 - U+10FFFF, which takes 4 bytes
	    // in UTF-8. The pair consists of two code units, so each surrogate is
	    // charged half of that (4 / 2 = 2).
	    [0xDFFF, 2],
	    [0xFFFF, 3]   // U+E000 - U+FFFF: 3 bytes in UTF-8
	  ];

	  for (var k = 0; k < ranges.length; ++k)
	  {
	    if (codePoint <= ranges[k][0])
	    {
	      return ranges[k][1];
	    }
	  }

	  // U+10000 and above: never reached for UTF-16 code units.
	  return 0;
	}

	function count_up()
	{
	  // Recomputes the three counters for the current text of the "input"
	  // element and writes them into the output elements.

	  // The input text and the HTML elements that display each result.
	  var input         = document.getElementById("input").value;
	  var outputLetters = document.getElementById("outputLetters");
	  var outputBytes   = document.getElementById("outputBytes");
	  var outputPairs   = document.getElementById("outputPairs");

	  // Counters: characters, bytes in UTF-8, and surrogate pairs.
	  var nLetters = 0;
	  var nBytes   = 0;
	  var nPairs   = 0;

	  // Walk the string one UTF-16 code unit at a time.
	  for (var i = 0; i < input.length; ++i)
	  {
	    // charCodeAt() always returns a value smaller than 65,536. Larger
	    // code points (U+10000 and above) appear as a high surrogate
	    // immediately followed by a low surrogate.
	    var codeUnit = input.charCodeAt(i);

	    var isHighSurrogate = (0xD800 <= codeUnit && codeUnit <= 0xDBFF);
	    var isLowSurrogate  = (0xDC00 <= codeUnit && codeUnit <= 0xDFFF);

	    if (isHighSurrogate && i + 1 < input.length)
	    {
	      var nextUnit = input.charCodeAt(i + 1);

	      if (0xDC00 <= nextUnit && nextUnit <= 0xDFFF)
	      {
	        // Well-formed surrogate pair: one character in the range
	        // U+10000 - U+10FFFF, which takes 4 bytes in UTF-8.
	        ++nLetters;
	        ++nPairs;
	        nBytes += 4;

	        // The low surrogate has been consumed together with the high one.
	        ++i;
	        continue;
	      }
	    }

	    // Anything else counts as one character.
	    ++nLetters;

	    if (isHighSurrogate || isLowSurrogate)
	    {
	      // Unpaired surrogate: it is not part of a pair, so it must not be
	      // counted as one. A UTF-8 encoder such as TextEncoder replaces it
	      // with U+FFFD, which takes 3 bytes.
	      nBytes += 3;
	    }
	    else
	    {
	      // Ordinary BMP character: 1 to 3 bytes depending on its range.
	      nBytes += compute_bytes_in_utf8(codeUnit);
	    }
	  }

	  // Write the results. The values are plain numbers, so textContent is
	  // sufficient and avoids HTML parsing.
	  outputLetters.textContent = nLetters;
	  outputBytes.textContent   = nBytes;
	  outputPairs.textContent   = nPairs;
	}
	</script>
	</head>
	<body>
	<!-- Input: the counters are recomputed on every edit -->
	<input id="input" type="text" oninput="count_up()"><br><br>

	<!-- Output: number of characters -->
	文字数： <span id="outputLetters">0</span><br>

	<!-- Output: number of bytes in UTF-8 -->
	UTF-8 でのバイト数： <span id="outputBytes">0</span><br>

	<!-- Output: number of surrogate pairs -->
	サロゲートペアの数： <span id="outputPairs">0</span><br>
	</body>
	</html>