TakahikoKawasaki/gist:8156948

## gistfile1.html
<!DOCTYPE html>
<html lang="en">
<!--
 * Copyright (C) 2013 Neo Visionaries Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
-->
<head>
<!-- Character encoding declaration for the document. -->
<meta charset="utf-8">
<meta name="author" content="Takahiko Kawasaki">
<title>Count up letters, bytes in UTF-8 and surrogate pairs in JavaScript</title>
<script type="text/javascript">
// Compute how many bytes a value contributes when encoded in UTF-8.
//
// The argument may be either a UTF-16 code unit (what charCodeAt()
// returns) or a full Unicode code point (what codePointAt() returns).
//
// Returns:
//   1, 2 or 3 for non-surrogate values up to U+FFFF,
//   2 for a single surrogate code unit (half of the 4 bytes that
//     the surrogate pair it belongs to consumes),
//   4 for code points in U+10000 - U+10FFFF,
//   0 for anything else (values above U+10FFFF, NaN).
function compute_bytes_in_utf8(codePoint)
{
  // Unicode code points and their corresponding values
  // encoded in UTF-16BE are identical except code points
  // that are higher than U+FFFF.

  if (codePoint <= 0x007F)
  {
    // U+0000 - U+007F: 1 byte in UTF-8.
    return 1;
  }
  else if (codePoint <= 0x07FF)
  {
    // U+0080 - U+07FF: 2 bytes in UTF-8.
    return 2;
  }
  else if (codePoint <= 0xD7FF)
  {
    // U+0800 - U+D7FF: 3 bytes in UTF-8.
    return 3;
  }
  else if (codePoint <= 0xDFFF)
  {
    // 0xD800 - 0xDBFF: High surrogates.
    // 0xDC00 - 0xDFFF: Low surrogates.
    //
    // The range represented by surrogate pairs is
    // U+10000 - U+10FFFF, and characters in the range
    // consume 4 bytes in UTF-8. Each half of the pair is
    // therefore charged 2 (= 4 / 2) bytes, so that summing
    // over both code units of a pair yields 4.
    return 2;
  }
  else if (codePoint <= 0xFFFF)
  {
    // U+E000 - U+FFFF: 3 bytes in UTF-8.
    return 3;
  }
  else if (codePoint <= 0x10FFFF)
  {
    // U+10000 - U+10FFFF: 4 bytes in UTF-8. charCodeAt() never
    // produces such a value, but codePointAt() does.
    return 4;
  }
  else
  {
    // Not a Unicode code point (above U+10FFFF, or NaN, for
    // which every comparison above is false).
    return 0;
  }
}

// Recount the text in the element whose id is "input" and write the
// number of letters, the number of bytes in UTF-8 and the number of
// surrogate pairs into the elements whose ids are "outputLetters",
// "outputBytes" and "outputPairs" respectively.
function count_up()
{
  // HTML elements for input and output.
  var input         = document.getElementById("input").value;
  var outputLetters = document.getElementById("outputLetters");
  var outputBytes   = document.getElementById("outputBytes");
  var outputPairs   = document.getElementById("outputPairs");

  // Counters for letters, bytes in UTF-8 and surrogate pairs.
  var nLetters = 0;
  var nBytes   = 0;
  var nPairs   = 0;

  // For each letter in the input string. A letter is either one
  // UTF-16 code unit or a well-formed surrogate pair.
  for (var i = 0; i < input.length; ++i)
  {
    // Note that charCodeAt() always returns a value that is
    // less than 65,536. Higher code points (= U+10000 and
    // higher) are represented by surrogate pairs.
    var codeUnit = input.charCodeAt(i);
    var isHigh   = (0xD800 <= codeUnit && codeUnit <= 0xDBFF);
    var isLow    = (0xDC00 <= codeUnit && codeUnit <= 0xDFFF);

    // Each iteration consumes exactly one letter.
    ++nLetters;

    if (isHigh && i + 1 < input.length)
    {
      var next = input.charCodeAt(i + 1);

      if (0xDC00 <= next && next <= 0xDFFF)
      {
        // A high surrogate followed by a low surrogate: one
        // code point in U+10000 - U+10FFFF, 4 bytes in UTF-8.
        ++nPairs;
        nBytes += 4;

        // Skip the low surrogate that has just been consumed.
        ++i;
        continue;
      }
    }

    if (isHigh || isLow)
    {
      // An unpaired surrogate. Well-formed UTF-8 cannot contain
      // it; encoders such as TextEncoder substitute U+FFFD,
      // which consumes 3 bytes.
      nBytes += 3;
      continue;
    }

    // A code unit outside the surrogate range is a code point
    // by itself.
    nBytes += compute_bytes_in_utf8(codeUnit);
  }

  // Write results. The values are plain numbers, so they are
  // written as text rather than parsed as HTML.
  outputLetters.textContent = nLetters;
  outputBytes.textContent   = nBytes;
  outputPairs.textContent   = nPairs;
}
</script>
</head>
<body>
  <!-- Input: every change to the text triggers a recount. -->
  <label for="input">Text to count:</label>
  <input id="input" type="text" oninput="count_up()"><br>

  <!-- Output: Number of letters -->
  <span id="outputLetters">0</span> letter(s).<br>

  <!-- Output: Number of bytes in UTF-8 -->
  <span id="outputBytes">0</span> byte(s) in UTF-8.<br>

  <!-- Output: Number of surrogate pairs -->
  <span id="outputPairs">0</span> surrogate pair(s).<br>
</body>
</html>
	<!DOCTYPE html>
	<html lang="en">
	<!--
	* Copyright (C) 2013 Neo Visionaries Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	-->
	<head>
	<!-- Character encoding declaration for the document. -->
	<meta charset="utf-8">
	<meta name="author" content="Takahiko Kawasaki">
	<title>Count up letters, bytes in UTF-8 and surrogate pairs in JavaScript</title>
	<script type="text/javascript">
	// Compute how many bytes a value contributes when encoded in UTF-8.
	//
	// The argument may be either a UTF-16 code unit (what charCodeAt()
	// returns) or a full Unicode code point (what codePointAt() returns).
	//
	// Returns:
	//   1, 2 or 3 for non-surrogate values up to U+FFFF,
	//   2 for a single surrogate code unit (half of the 4 bytes that
	//     the surrogate pair it belongs to consumes),
	//   4 for code points in U+10000 - U+10FFFF,
	//   0 for anything else (values above U+10FFFF, NaN).
	function compute_bytes_in_utf8(codePoint)
	{
	  // Unicode code points and their corresponding values
	  // encoded in UTF-16BE are identical except code points
	  // that are higher than U+FFFF.

	  if (codePoint <= 0x007F)
	  {
	    // U+0000 - U+007F: 1 byte in UTF-8.
	    return 1;
	  }
	  else if (codePoint <= 0x07FF)
	  {
	    // U+0080 - U+07FF: 2 bytes in UTF-8.
	    return 2;
	  }
	  else if (codePoint <= 0xD7FF)
	  {
	    // U+0800 - U+D7FF: 3 bytes in UTF-8.
	    return 3;
	  }
	  else if (codePoint <= 0xDFFF)
	  {
	    // 0xD800 - 0xDBFF: High surrogates.
	    // 0xDC00 - 0xDFFF: Low surrogates.
	    //
	    // The range represented by surrogate pairs is
	    // U+10000 - U+10FFFF, and characters in the range
	    // consume 4 bytes in UTF-8. Each half of the pair is
	    // therefore charged 2 (= 4 / 2) bytes, so that summing
	    // over both code units of a pair yields 4.
	    return 2;
	  }
	  else if (codePoint <= 0xFFFF)
	  {
	    // U+E000 - U+FFFF: 3 bytes in UTF-8.
	    return 3;
	  }
	  else if (codePoint <= 0x10FFFF)
	  {
	    // U+10000 - U+10FFFF: 4 bytes in UTF-8. charCodeAt() never
	    // produces such a value, but codePointAt() does.
	    return 4;
	  }
	  else
	  {
	    // Not a Unicode code point (above U+10FFFF, or NaN, for
	    // which every comparison above is false).
	    return 0;
	  }
	}

	// Recount the text in the element whose id is "input" and write the
	// number of letters, the number of bytes in UTF-8 and the number of
	// surrogate pairs into the elements whose ids are "outputLetters",
	// "outputBytes" and "outputPairs" respectively.
	function count_up()
	{
	  // HTML elements for input and output.
	  var input = document.getElementById("input").value;
	  var outputLetters = document.getElementById("outputLetters");
	  var outputBytes = document.getElementById("outputBytes");
	  var outputPairs = document.getElementById("outputPairs");

	  // Counters for letters, bytes in UTF-8 and surrogate pairs.
	  var nLetters = 0;
	  var nBytes = 0;
	  var nPairs = 0;

	  // For each letter in the input string. A letter is either one
	  // UTF-16 code unit or a well-formed surrogate pair.
	  for (var i = 0; i < input.length; ++i)
	  {
	    // Note that charCodeAt() always returns a value that is
	    // less than 65,536. Higher code points (= U+10000 and
	    // higher) are represented by surrogate pairs.
	    var codeUnit = input.charCodeAt(i);
	    var isHigh = (0xD800 <= codeUnit && codeUnit <= 0xDBFF);
	    var isLow = (0xDC00 <= codeUnit && codeUnit <= 0xDFFF);

	    // Each iteration consumes exactly one letter.
	    ++nLetters;

	    if (isHigh && i + 1 < input.length)
	    {
	      var next = input.charCodeAt(i + 1);

	      if (0xDC00 <= next && next <= 0xDFFF)
	      {
	        // A high surrogate followed by a low surrogate: one
	        // code point in U+10000 - U+10FFFF, 4 bytes in UTF-8.
	        ++nPairs;
	        nBytes += 4;

	        // Skip the low surrogate that has just been consumed.
	        ++i;
	        continue;
	      }
	    }

	    if (isHigh || isLow)
	    {
	      // An unpaired surrogate. Well-formed UTF-8 cannot contain
	      // it; encoders such as TextEncoder substitute U+FFFD,
	      // which consumes 3 bytes.
	      nBytes += 3;
	      continue;
	    }

	    // A code unit outside the surrogate range is a code point
	    // by itself.
	    nBytes += compute_bytes_in_utf8(codeUnit);
	  }

	  // Write results. The values are plain numbers, so they are
	  // written as text rather than parsed as HTML.
	  outputLetters.textContent = nLetters;
	  outputBytes.textContent = nBytes;
	  outputPairs.textContent = nPairs;
	}
	</script>
	</head>
	<body>
	<!-- Input: every change to the text triggers a recount. -->
	<label for="input">Text to count:</label>
	<input id="input" type="text" oninput="count_up()"><br>

	<!-- Output: Number of letters -->
	<span id="outputLetters">0</span> letter(s).<br>

	<!-- Output: Number of bytes in UTF-8 -->
	<span id="outputBytes">0</span> byte(s) in UTF-8.<br>

	<!-- Output: Number of surrogate pairs -->
	<span id="outputPairs">0</span> surrogate pair(s).<br>
	</body>
	</html>