palianytsia/CharByExample.java

## CharByExample.java
/**
 * <p>
 * Runnable examples that help to understand the <code>char</code> type together
 * with the <strong>Unicode encoding scheme</strong>. The explanations and examples
 * are based on "The char Type" and "Code Points and Code Units" chapters of the
 * "Core Java" book (Volume 1, 8th edition) by Cay Horstmann and Gary Cornell.
 * </p>
 * <p>
 * Before proceeding to the examples, read the following terminology:
 * </p>
 * <p>
 * A <em>code point</em> is a code value that is associated with a character in
 * an encoding scheme. In the Unicode standard, code points are written in
 * hexadecimal and prefixed with U+, such as U+0041 for the code point of the
 * letter A. Unicode has code points that are grouped into 17
 * <em>code planes</em>. The first code plane, called the
 * <em>basic multilingual plane</em>, consists of the "classic" Unicode
 * characters with code points U+0000 to U+FFFF. Sixteen additional planes, with
 * code points U+10000 to U+10FFFF, hold the <em>supplementary characters</em>.
 * </p>
 * <p>
 * The UTF-16 encoding is a method of representing all Unicode code points in a
 * variable-length code. The characters in the basic multilingual plane are
 * represented as 16-bit values, called <em>code units</em>. The supplementary
 * characters are encoded as consecutive pairs of code units. Each of the values
 * in such an encoding pair falls into a range of 2048 unused values of the
 * basic multilingual plane, called the <em>surrogates area</em> (U+D800 to
 * U+DBFF for the first code unit, U+DC00 to U+DFFF for the second code unit).
 * This is rather clever, because you can immediately tell whether a code unit
 * encodes a single character or whether it is the first or second part of a
 * supplementary character.
 * </p>
 * <p>
 * All examples write to <code>System.out</code>; how the supplementary character
 * is rendered depends on the encoding of that stream.
 * </p>
 *
 * @author Ivan Palianytsia
 */
public class CharByExample
{

	/**
	 * The supplementary character U+1D56B (double-struck small z) written as its
	 * UTF-16 surrogate pair. The examples, following the book, present it as the
	 * symbol for the set of integers.
	 */
	private static final String DOUBLE_STRUCK_Z = "\uD835\uDD6B";

	/** Sample sentence that starts with a supplementary character. */
	private static final String SAMPLE_SENTENCE = DOUBLE_STRUCK_Z + " is the set of integers";

	/**
	 * Runs the four examples in order, printing their output to standard output.
	 *
	 * @param args command-line arguments; not used
	 */
	public static void main(String[] args)
	{
		example1();
		example2();
		example3();
		example4();
	}

	/**
	 * Example 1. Unicode code units can be expressed as hexadecimal values that
	 * run from \u0000 to \uFFFF. For example, \u2122 is the trademark symbol
	 * (TM) and \u03C0 is the Greek letter pi.
	 */
	private static void example1()
	{
		System.out.println("***** Example 1 *****");
		// "\\u2122" prints the escape sequence itself, "\u2122" prints the character.
		System.out.println("\\u2122 is the trademark symbol \u2122");
		System.out.println("And \\u03C0 is the Greek letter pi (\u03C0)");
		System.out.println();
	}

	/**
	 * Example 2. The symbol used here for the set of integers has code point
	 * U+1D56B and is encoded by the two code units U+D835 and U+DD6B. (U+1D56B is
	 * the double-struck small z; the usual integers symbol is U+2124.)
	 */
	private static void example2()
	{
		System.out.println("***** Example 2 *****");
		String z = DOUBLE_STRUCK_Z;
		// codePointAt(0) combines the surrogate pair into the single code point.
		System.out.println(z + " has code point U+"
			+ Integer.toHexString(z.codePointAt(0)).toUpperCase());
		System.out.println("And is encoded by the two code units U+D835 and U+DD6B");
		System.out.println();
	}

	/**
	 * Example 3. The length method yields the number of code units required for
	 * a given string in the UTF-16 encoding. To get the true length, that is,
	 * the number of code points, use the <code>codePointCount</code> method of the
	 * <code>String</code> class.
	 */
	private static void example3()
	{
		System.out.println("***** Example 3 *****");
		String string = SAMPLE_SENTENCE;
		System.out.println("String \"" + string + "\" has length " + string.length());
		System.out.println("However actual number of code points (symbols) in the string is "
			+ string.codePointCount(0, string.length()));
		System.out.println();
	}

	/**
	 * Example 4. The call <code>s.charAt(n)</code> returns the code unit at
	 * position n, where n is between 0 and <code>s.length()</code> - 1. To get at
	 * the ith code point, use the statements
	 *
	 * <pre>
	 * int index = greeting.offsetByCodePoints(0, i);
	 * int cp = greeting.codePointAt(index);
	 * </pre>
	 *
	 * The example then walks a string code point by code point and finally shows
	 * what <code>codePointAt</code> returns for each half of a surrogate pair.
	 */
	private static void example4()
	{
		System.out.println("***** Example 4 *****");
		String stringA = "Hello";
		String stringB = SAMPLE_SENTENCE;
		System.out.println("Call to charAt(1) on the String \"" + stringA
			+ "\" returns the second character - " + stringA.charAt(1));
		System.out.println("However the same call on the String \"" + stringB
			+ "\" doesn't return a space but the second code unit of " + DOUBLE_STRUCK_Z + " - "
			+ stringB.charAt(1));
		System.out.println("To avoid this problem, you should not use the char "
			+ "type. It is too low-level.");
		System.out.println("Right way to traverse a string is to look at each code point in turn:");
		int i = 0;
		while (i < stringB.length())
		{
			int cp = stringB.codePointAt(i);
			printCodePoint(cp);
			// Advance by the number of code units this code point occupies:
			// 2 for a supplementary code point, 1 otherwise.
			i += Character.charCount(cp);
		}
		System.out.println();
		System.out.println("Unfortunately, the codePointAt method can't tell whether a code unit"
			+ " is the first or second half of a supplementary character.\nIt returns the right"
			+ " result only on the first half of a supplementary character:");
		// Index 0 is the high surrogate, index 1 the low surrogate of the same pair.
		for (int j = 0; j < 2; j++)
		{
			int cp = DOUBLE_STRUCK_Z.codePointAt(j);
			printCodePoint(cp);
			System.out.println(" (" + cp + ")");
		}
	}

	/**
	 * Converts the integer value of a code point to its character(s) and prints
	 * them to standard output enclosed in square brackets, without a line break.
	 *
	 * @param cp the code point to print
	 */
	private static void printCodePoint(int cp)
	{
		System.out.print("[");
		// toChars yields one char for a BMP code point and a surrogate pair otherwise.
		System.out.print(Character.toChars(cp));
		System.out.print("]");
	}
}
	/**
	 * <p>
	 * Runnable examples that help to understand the <code>char</code> type together
	 * with the <strong>Unicode encoding scheme</strong>. The explanations and examples
	 * are based on "The char Type" and "Code Points and Code Units" chapters of the
	 * "Core Java" book (Volume 1, 8th edition) by Cay Horstmann and Gary Cornell.
	 * </p>
	 * <p>
	 * Before proceeding to the examples, read the following terminology:
	 * </p>
	 * <p>
	 * A <em>code point</em> is a code value that is associated with a character in
	 * an encoding scheme. In the Unicode standard, code points are written in
	 * hexadecimal and prefixed with U+, such as U+0041 for the code point of the
	 * letter A. Unicode has code points that are grouped into 17
	 * <em>code planes</em>. The first code plane, called the
	 * <em>basic multilingual plane</em>, consists of the "classic" Unicode
	 * characters with code points U+0000 to U+FFFF. Sixteen additional planes, with
	 * code points U+10000 to U+10FFFF, hold the <em>supplementary characters</em>.
	 * </p>
	 * <p>
	 * The UTF-16 encoding is a method of representing all Unicode code points in a
	 * variable-length code. The characters in the basic multilingual plane are
	 * represented as 16-bit values, called <em>code units</em>. The supplementary
	 * characters are encoded as consecutive pairs of code units. Each of the values
	 * in such an encoding pair falls into a range of 2048 unused values of the
	 * basic multilingual plane, called the <em>surrogates area</em> (U+D800 to
	 * U+DBFF for the first code unit, U+DC00 to U+DFFF for the second code unit).
	 * This is rather clever, because you can immediately tell whether a code unit
	 * encodes a single character or whether it is the first or second part of a
	 * supplementary character.
	 * </p>
	 *
	 * @author Ivan Palianytsia
	 */
	public class CharByExample
	{

		/**
		 * Runs the four examples in order, printing their output to standard output.
		 *
		 * @param args command-line arguments; not used
		 */
		public static void main(String[] args)
		{
			example1();
			example2();
			example3();
			example4();
		}

		/**
		 * Example 1. Unicode code units can be expressed as hexadecimal values that
		 * run from \u0000 to \uFFFF. For example, \u2122 is the trademark symbol
		 * (TM) and \u03C0 is the Greek letter pi.
		 */
		private static void example1()
		{
			System.out.println("*** Example 1 ***");
			// The doubled backslash prints the escape text; the single one prints the symbol.
			System.out.println("\\u2122 is the trademark symbol \u2122");
			System.out.println("And \\u03C0 is the Greek letter pi (\u03C0)");
			System.out.println();
		}

		/**
		 * Example 2. The symbol used here for the set of integers has code point
		 * U+1D56B and is encoded by the two code units U+D835 and U+DD6B. (U+1D56B is
		 * the double-struck small z; the usual integers symbol is U+2124.)
		 */
		private static void example2()
		{
			System.out.println("*** Example 2 ***");
			String z = "\uD835\uDD6B";
			// Reading at index 0 joins both surrogates into one code point value.
			int codePoint = z.codePointAt(0);
			System.out.println(z + " has code point U+"
				+ Integer.toHexString(codePoint).toUpperCase());
			System.out.println("And is encoded by the two code units U+D835 and U+DD6B");
			System.out.println();
		}

		/**
		 * Example 3. The length method yields the number of code units required for
		 * a given string in the UTF-16 encoding. To get the true length, that is,
		 * the number of code points, use the <code>codePointCount</code> method of the
		 * <code>String</code> class.
		 */
		private static void example3()
		{
			System.out.println("*** Example 3 ***");
			String string = "\uD835\uDD6B is the set of integers";
			System.out.println("String \"" + string + "\" has length " + string.length());
			System.out.println("However actual number of code points (symbols) in the string is "
				+ string.codePointCount(0, string.length()));
			System.out.println();
		}

		/**
		 * Example 4. The call <code>s.charAt(n)</code> returns the code unit at
		 * position n, where n is between 0 and <code>s.length()</code> - 1. To get at
		 * the ith code point, use the statements
		 *
		 * <pre>
		 * int index = greeting.offsetByCodePoints(0, i);
		 * int cp = greeting.codePointAt(index);
		 * </pre>
		 *
		 * The example then walks a string code point by code point and finally shows
		 * what <code>codePointAt</code> returns for each half of a surrogate pair.
		 */
		private static void example4()
		{
			System.out.println("*** Example 4 ***");
			String stringA = "Hello";
			String stringB = "\uD835\uDD6B is the set of integers";
			System.out.println("Call to charAt(1) on the String \"" + stringA
				+ "\" returns the second character - " + stringA.charAt(1));
			System.out.println("However the same call on the String \"" + stringB
				+ "\" doesn't return a space but the second code unit of " + "\uD835\uDD6B" + " - "
				+ stringB.charAt(1));
			System.out.println("To avoid this problem, you should not use the char "
				+ "type. It is too low-level.");
			System.out.println("Right way to traverse a string is to look at each code point in turn:");
			for (int i = 0; i < stringB.length(); )
			{
				int cp = stringB.codePointAt(i);
				printCodePoint(cp);
				// Step over every code unit of this code point (2 if supplementary, else 1).
				i += Character.charCount(cp);
			}
			System.out.println();
			System.out.println("Unfortunately, the codePointAt method can't tell whether a code unit"
				+ " is the first or second half of a supplementary character.\nIt returns the right"
				+ " result only on the first half of a supplementary character:");
			// j = 0 addresses the high surrogate, j = 1 the low surrogate of the pair.
			for (int j = 0; j < 2; j++)
			{
				int cp = "\uD835\uDD6B".codePointAt(j);
				printCodePoint(cp);
				System.out.println(" (" + cp + ")");
			}
		}

		/**
		 * Converts the integer value of a code point to its character(s) and prints
		 * them to standard output enclosed in square brackets, without a line break.
		 *
		 * @param cp the code point to print
		 */
		private static void printCodePoint(int cp)
		{
			System.out.print("[");
			// toChars yields one char for a BMP code point and a surrogate pair otherwise.
			System.out.print(Character.toChars(cp));
			System.out.print("]");
		}
	}