Created
May 21, 2013 17:07
-
-
Save quoll/5621440 to your computer and use it in GitHub Desktop.
Provides codepoint support as an extension to what java.lang.Character can do
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package util; | |
import java.util.*; | |
/**
 * This class fills in where java.lang.Character can't manage 21 bit Unicode.
 * Represents Unicode code points, U+0000 to U+10FFFF.
 *
 * @author Paul Gearon
 */
public class CodePoint implements java.io.Serializable, Comparable<CodePoint> {

  /** The serialization ID. */
  private static final long serialVersionUID = 3212781738993088980L;

  /** The internal representation: the numeric value of the code point. */
  private final int data;

  /** The maximum value of a Unicode code point. */
  public static final CodePoint MAX_VALUE = new CodePoint(Character.MAX_CODE_POINT);

  /** The minimum value of a Unicode code point. */
  public static final CodePoint MIN_VALUE = new CodePoint(Character.MIN_CODE_POINT);

  /**
   * Constructs a new CodePoint for a given Character value.
   * No check is made for surrogates: a surrogate char is stored as-is.
   *
   * @param value The value to be represented by the CodePoint.
   * @throws NullPointerException if {@code value} is {@code null}.
   */
  public CodePoint(Character value) {
    this(value.charValue());
  }

  /**
   * Constructs a new CodePoint for a given char value.
   * No check is made for surrogates: a surrogate char is stored as-is.
   *
   * @param value The value to be represented by the CodePoint.
   */
  public CodePoint(char value) {
    data = value;
  }

  /**
   * Constructs a new CodePoint for a given numeric code point value.
   *
   * @param value The numeric value of this CodePoint.
   * @throws IllegalArgumentException if {@link Character#isValidCodePoint(int)}
   *         rejects {@code value}.
   */
  public CodePoint(int value) {
    if (!Character.isValidCodePoint(value)) {
      throw new IllegalArgumentException("Code point out of range");
    }
    data = value;
  }

  /**
   * Constructs a new CodePoint from the start of a given char array.
   *
   * @param value The value to be represented by the CodePoint, in UTF-16.
   */
  public CodePoint(char[] value) {
    this(value, 0);
  }

  /**
   * Constructs a new CodePoint from the start of a given Character array.
   *
   * @param value The value to be represented by the CodePoint, in UTF-16.
   * @throws IllegalArgumentException if the array consists of a single high surrogate.
   */
  public CodePoint(Character[] value) {
    this(value, 0);
  }

  /**
   * Constructs a new CodePoint from an offset into a given char array,
   * decoding with {@link Character#codePointAt(char[], int)}.
   *
   * @param chars The UTF-16 array containing the CodePoints.
   * @param offset The offset into the array to read from.
   */
  public CodePoint(char[] chars, int offset) {
    data = Character.codePointAt(chars, offset);
  }

  /**
   * Constructs a new CodePoint from an offset into a given Character array.
   *
   * @param characters The UTF-16 array containing the CodePoints.
   * @param offset The offset into the array to read from.
   * @throws IllegalArgumentException if the element at {@code offset} is a high
   *         surrogate and it is the last element of the array.
   */
  public CodePoint(Character[] characters, int offset) {
    // The surrogate test must look at the element being decoded (characters[offset]),
    // not at the first element of the array.
    char lead = characters[offset];
    char[] utf16;
    if (Character.isHighSurrogate(lead)) {
      if (characters.length - offset < 2) {
        throw new IllegalArgumentException("Malformed UniCode character array");
      }
      utf16 = new char[] { lead, characters[offset + 1] };
    } else {
      utf16 = new char[] { lead };
    }
    data = Character.codePointAt(utf16, 0);
  }

  /**
   * Gets the numeric code point value for use with java.lang.Character methods.
   *
   * @return The numeric value of this CodePoint.
   */
  public int intValue() {
    return data;
  }

  /**
   * Converts this code point to its UTF-16 representation stored in a char array.
   *
   * @return An array containing the UTF-16 representation of the current code point.
   */
  public char[] toChars() {
    return Character.toChars(data);
  }

  /**
   * Converts this code point to its UTF-16 representation stored in a provided char array.
   *
   * @param dst An array of char in which the code point's UTF-16 value is stored.
   * @param dstIndex The start index into the dst array where the converted value is stored.
   * @return 1 if the code point is a BMP code point, 2 if the code point is a
   *         supplementary code point.
   */
  public int toChars(char[] dst, int dstIndex) {
    return Character.toChars(data, dst, dstIndex);
  }

  /**
   * Converts this code point to its UTF-16 representation stored in a Character array.
   *
   * @return An array containing the UTF-16 representation of the current code point.
   */
  public Character[] toCharacters() {
    return toCharacters(Character.toChars(data));
  }

  /**
   * Converts this code point to its UTF-16 representation stored in a provided
   * Character array.
   *
   * @param dst An array of Character in which the code point's UTF-16 value is stored.
   * @param dstIndex The start index into the dst array where the converted value is stored.
   * @return 1 if the code point is a BMP code point, 2 if the code point is a
   *         supplementary code point.
   */
  public int toCharacters(Character[] dst, int dstIndex) {
    // Encode into a scratch buffer first; a code point never needs more than two chars.
    char[] scratch = new char[2];
    int size = Character.toChars(data, scratch, 0);
    dst[dstIndex] = scratch[0];
    if (size == 2) {
      dst[dstIndex + 1] = scratch[1];
    }
    return size;
  }

  /**
   * Tests for equality.
   *
   * @param o The object to compare to.
   * @return {@code true} if o is a CodePoint with the same value as this object.
   */
  @Override
  public boolean equals(Object o) {
    return (o instanceof CodePoint) && ((CodePoint) o).data == data;
  }

  /**
   * Gets a hashcode for this object.
   *
   * @return The numeric value of the code point, used as the hash code.
   */
  @Override
  public int hashCode() {
    return data;
  }

  /**
   * Performs a comparison on another CodePoint object.
   *
   * @param c The CodePoint to compare to.
   * @return A negative integer, zero, or a positive integer as this object is less than,
   *         equal to, or greater than the specified object.
   */
  @Override
  public int compareTo(CodePoint c) {
    // bit ranges guarantee that we can take data-c.data, but this is poor practice in general
    if (data == c.data) {
      return 0;
    }
    return data < c.data ? -1 : 1;
  }

  /**
   * Returns a String object representing this code point's value.
   *
   * @return A String holding the UTF-16 encoding of this code point.
   */
  @Override
  public String toString() {
    return new String(toChars());
  }

  /**
   * Determines if this code point is lowercase, as reported by
   * {@link Character#isLowerCase(int)}.
   * <p>
   * Code points whose general category type, provided by {@link #getType()}, is
   * {@code LOWERCASE_LETTER} are lowercase. Examples include the letters
   * {@code a} through {@code z}, U+00DF through U+00F6 and U+00F8 through U+00FF.
   * Many other Unicode code points are lowercase too.
   *
   * @return {@code true} if the code point is lowercase; {@code false} otherwise.
   */
  public boolean isLowerCase() {
    return Character.isLowerCase(data);
  }

  /**
   * Determines if this code point is uppercase, as reported by
   * {@link Character#isUpperCase(int)}.
   * <p>
   * Code points whose general category type, provided by {@link #getType()}, is
   * {@code UPPERCASE_LETTER} are uppercase. Examples include the letters
   * {@code A} through {@code Z}, U+00C0 through U+00D6 and U+00D8 through U+00DE.
   * Many other Unicode code points are uppercase too.
   *
   * @return {@code true} if the code point is uppercase; {@code false} otherwise.
   */
  public boolean isUpperCase() {
    return Character.isUpperCase(data);
  }

  /**
   * Determines if this code point is a titlecase code point, as reported by
   * {@link Character#isTitleCase(int)}.
   * <p>
   * A code point is a titlecase code point if its general category type, provided by
   * {@link #getType()}, is {@code TITLECASE_LETTER}.
   * <p>
   * Some code points look like pairs of Latin letters. For example, there
   * is an uppercase letter that looks like "LJ" and has a corresponding
   * lowercase letter that looks like "lj". A third form, which looks like "Lj",
   * is the appropriate form to use when rendering a word in lowercase
   * with initial capitals, as for a book title.
   * <p>
   * These are some of the Unicode code points for which this method returns {@code true}:
   * <ul>
   * <li>{@code LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON}
   * <li>{@code LATIN CAPITAL LETTER L WITH SMALL LETTER J}
   * <li>{@code LATIN CAPITAL LETTER N WITH SMALL LETTER J}
   * <li>{@code LATIN CAPITAL LETTER D WITH SMALL LETTER Z}
   * </ul>
   * Many other Unicode code points are titlecase too.
   *
   * @return {@code true} if the code point is titlecase; {@code false} otherwise.
   */
  public boolean isTitleCase() {
    return Character.isTitleCase(data);
  }

  /**
   * Determines if this code point is a digit, as reported by
   * {@link Character#isDigit(int)}.
   * <p>
   * A code point is a digit if its general category type, provided by
   * {@link #getType()}, is {@code DECIMAL_DIGIT_NUMBER}.
   * <p>
   * Some Unicode code point ranges that contain digits:
   * <ul>
   * <li>U+0030 through U+0039, ISO-LATIN-1 digits ({@code '0'} through {@code '9'})
   * <li>U+0660 through U+0669, Arabic-Indic digits
   * <li>U+06F0 through U+06F9, Extended Arabic-Indic digits
   * <li>U+0966 through U+096F, Devanagari digits
   * <li>U+FF10 through U+FF19, Fullwidth digits
   * </ul>
   * Many other code point ranges contain digits as well.
   *
   * @return {@code true} if the code point is a digit; {@code false} otherwise.
   */
  public boolean isDigit() {
    return Character.isDigit(data);
  }

  /**
   * Determines if this code point is defined in Unicode, as reported by
   * {@link Character#isDefined(int)}.
   * <p>
   * A code point is defined if at least one of the following is true:
   * <ul>
   * <li>It has an entry in the UnicodeData file.
   * <li>It has a value in a range defined by the UnicodeData file.
   * </ul>
   *
   * @return {@code true} if the code point has a defined meaning in Unicode;
   *         {@code false} otherwise.
   */
  public boolean isDefined() {
    return Character.isDefined(data);
  }

  /**
   * Determines if this code point is a letter, as reported by
   * {@link Character#isLetter(int)}.
   * <p>
   * A code point is considered to be a letter if its general category type,
   * provided by {@link #getType()}, is any of the following:
   * <ul>
   * <li>{@code UPPERCASE_LETTER}
   * <li>{@code LOWERCASE_LETTER}
   * <li>{@code TITLECASE_LETTER}
   * <li>{@code MODIFIER_LETTER}
   * <li>{@code OTHER_LETTER}
   * </ul>
   * Not all letters have case. Many code points are letters but are neither
   * uppercase nor lowercase nor titlecase.
   *
   * @return {@code true} if the code point is a letter; {@code false} otherwise.
   */
  public boolean isLetter() {
    return Character.isLetter(data);
  }

  /**
   * Determines if this code point is a letter or digit, as reported by
   * {@link Character#isLetterOrDigit(int)}.
   * <p>
   * A code point is considered to be a letter or digit if either
   * {@link #isLetter()} or {@link #isDigit()} returns {@code true} for it.
   *
   * @return {@code true} if the code point is a letter or digit; {@code false} otherwise.
   */
  public boolean isLetterOrDigit() {
    return Character.isLetterOrDigit(data);
  }

  /**
   * Determines if this code point is permissible as the first code point in a
   * Java identifier, as reported by {@link Character#isJavaIdentifierStart(int)}.
   * <p>
   * A code point may start a Java identifier if and only if one of the following
   * conditions is true:
   * <ul>
   * <li>{@link #isLetter()} returns {@code true}
   * <li>{@link #getType()} returns {@code LETTER_NUMBER}
   * <li>the code point is a currency symbol (such as "$")
   * <li>the code point is a connecting punctuation code point (such as "_").
   * </ul>
   *
   * @return {@code true} if the code point may start a Java identifier;
   *         {@code false} otherwise.
   */
  public boolean isJavaIdentifierStart() {
    return Character.isJavaIdentifierStart(data);
  }

  /**
   * Determines if this code point may be part of a Java identifier as other than
   * the first code point, as reported by {@link Character#isJavaIdentifierPart(int)}.
   * <p>
   * A code point may be part of a Java identifier if any of the following are true:
   * <ul>
   * <li>it is a letter
   * <li>it is a currency symbol (such as {@code '$'})
   * <li>it is a connecting punctuation code point (such as {@code '_'})
   * <li>it is a digit
   * <li>it is a numeric letter (such as a Roman numeral code point)
   * <li>it is a combining mark
   * <li>it is a non-spacing mark
   * <li>{@link #isIdentifierIgnorable()} returns {@code true} for the code point
   * </ul>
   *
   * @return {@code true} if the code point may be part of a Java identifier;
   *         {@code false} otherwise.
   */
  public boolean isJavaIdentifierPart() {
    return Character.isJavaIdentifierPart(data);
  }

  /**
   * Determines if this code point is permissible as the first code point in a
   * Unicode identifier, as reported by {@link Character#isUnicodeIdentifierStart(int)}.
   * <p>
   * A code point may start a Unicode identifier if and only if one of the
   * following conditions is true:
   * <ul>
   * <li>{@link #isLetter()} returns {@code true}
   * <li>{@link #getType()} returns {@code LETTER_NUMBER}.
   * </ul>
   *
   * @return {@code true} if the code point may start a Unicode identifier;
   *         {@code false} otherwise.
   */
  public boolean isUnicodeIdentifierStart() {
    return Character.isUnicodeIdentifierStart(data);
  }

  /**
   * Determines if this code point may be part of a Unicode identifier as other than
   * the first code point, as reported by {@link Character#isUnicodeIdentifierPart(int)}.
   * <p>
   * A code point may be part of a Unicode identifier if and only if one of the
   * following statements is true:
   * <ul>
   * <li>it is a letter
   * <li>it is a connecting punctuation code point (such as {@code '_'})
   * <li>it is a digit
   * <li>it is a numeric letter (such as a Roman numeral code point)
   * <li>it is a combining mark
   * <li>it is a non-spacing mark
   * <li>{@link #isIdentifierIgnorable()} returns {@code true} for this code point.
   * </ul>
   *
   * @return {@code true} if the code point may be part of a Unicode identifier;
   *         {@code false} otherwise.
   */
  public boolean isUnicodeIdentifierPart() {
    return Character.isUnicodeIdentifierPart(data);
  }

  /**
   * Determines if this code point should be regarded as an ignorable code point in a
   * Java identifier or a Unicode identifier, as reported by
   * {@link Character#isIdentifierIgnorable(int)}.
   * <p>
   * The following Unicode code points are ignorable in a Java identifier
   * or a Unicode identifier:
   * <ul>
   * <li>ISO control code points that are not whitespace
   *   <ul>
   *   <li>U+0000 through U+0008
   *   <li>U+000E through U+001B
   *   <li>U+007F through U+009F
   *   </ul>
   * <li>all code points that have the {@code FORMAT} general category value
   * </ul>
   *
   * @return {@code true} if the code point is an ignorable control code point that
   *         may be part of a Java or Unicode identifier; {@code false} otherwise.
   */
  public boolean isIdentifierIgnorable() {
    return Character.isIdentifierIgnorable(data);
  }

  /**
   * Converts this code point to lowercase using {@link Character#toLowerCase(int)}.
   * <p>
   * Note that {@code cp.toLowerCase().isLowerCase()} does not always return
   * {@code true} for some ranges of code points, particularly those that are
   * symbols or ideographs.
   *
   * @return the lowercase equivalent of the code point, if any;
   *         otherwise, this code point itself.
   */
  public CodePoint toLowerCase() {
    int lower = Character.toLowerCase(data);
    // Reuse this immutable instance when the mapping is the identity.
    return lower == data ? this : new CodePoint(lower);
  }

  /**
   * Converts this code point to uppercase using {@link Character#toUpperCase(int)}.
   * <p>
   * Note that {@code cp.toUpperCase().isUpperCase()} does not always return
   * {@code true} for some ranges of code points, particularly those that are
   * symbols or ideographs.
   *
   * @return the uppercase equivalent of the code point, if any;
   *         otherwise, this code point itself.
   */
  public CodePoint toUpperCase() {
    int upper = Character.toUpperCase(data);
    // Reuse this immutable instance when the mapping is the identity.
    return upper == data ? this : new CodePoint(upper);
  }

  /**
   * Converts this code point to titlecase using {@link Character#toTitleCase(int)}.
   * If a code point has no explicit titlecase mapping and is not itself a titlecase
   * code point according to UnicodeData, then the uppercase mapping is returned as an
   * equivalent titlecase mapping. If the code point is already titlecase, the same
   * code point is returned.
   * <p>
   * Note that {@code cp.toTitleCase().isTitleCase()} does not always return
   * {@code true} for some ranges of code points.
   *
   * @return the titlecase equivalent of the code point, if any;
   *         otherwise, this code point itself.
   */
  public CodePoint toTitleCase() {
    int title = Character.toTitleCase(data);
    // Reuse this immutable instance when the mapping is the identity.
    return title == data ? this : new CodePoint(title);
  }

  /**
   * Returns the numeric value of this code point in the specified radix, using
   * {@link Character#digit(int, int)}.
   * <p>
   * If the radix is not in the range {@code MIN_RADIX <= radix <= MAX_RADIX} or if the
   * value of the code point is not a valid digit in the specified radix, {@code -1} is
   * returned. A code point is a valid digit if at least one of the following is true:
   * <ul>
   * <li>The method {@link #isDigit()} is {@code true} of the code point and the Unicode
   *     decimal digit value of the code point (or its single-code-point decomposition)
   *     is less than the specified radix. In this case the decimal digit value is
   *     returned.
   * <li>The code point is one of the uppercase Latin letters {@code 'A'} through
   *     {@code 'Z'} and its code is less than {@code radix + 'A' - 10}.
   *     In this case, {@code code - 'A' + 10} is returned.
   * <li>The code point is one of the lowercase Latin letters {@code 'a'} through
   *     {@code 'z'} and its code is less than {@code radix + 'a' - 10}.
   *     In this case, {@code code - 'a' + 10} is returned.
   * </ul>
   *
   * @param radix the radix.
   * @return the numeric value represented by the code point in the specified radix.
   */
  public int digit(int radix) {
    return Character.digit(data, radix);
  }

  /**
   * Returns the {@code int} value that this code point represents, using
   * {@link Character#getNumericValue(int)}. For example, the code point U+216C
   * (the roman numeral fifty) will return an int with a value of 50.
   * <p>
   * The letters A-Z in their uppercase (U+0041 through U+005A), lowercase
   * (U+0061 through U+007A), and full width variant (U+FF21 through U+FF3A and
   * U+FF41 through U+FF5A) forms have numeric values from 10 through 35. This is
   * independent of the Unicode specification, which does not assign numeric values
   * to these code points.
   * <p>
   * If the code point does not have a numeric value, then -1 is returned.
   * If the code point has a numeric value that cannot be represented as a
   * nonnegative integer (for example, a fractional value), then -2 is returned.
   *
   * @return the numeric value of the code point, as a nonnegative {@code int} value;
   *         -2 if the code point has a numeric value that is not a nonnegative integer;
   *         -1 if the code point has no numeric value.
   */
  public int getNumericValue() {
    return Character.getNumericValue(data);
  }

  /**
   * Determines if this code point is a Unicode space code point, as reported by
   * {@link Character#isSpaceChar(int)}.
   * A code point is considered to be a space code point if and only if it is
   * specified to be a space code point by the Unicode standard. This method returns
   * true if the code point's general category type is any of the following:
   * <ul>
   * <li>{@code SPACE_SEPARATOR}
   * <li>{@code LINE_SEPARATOR}
   * <li>{@code PARAGRAPH_SEPARATOR}
   * </ul>
   *
   * @return {@code true} if the code point is a space code point;
   *         {@code false} otherwise.
   */
  public boolean isSpaceChar() {
    return Character.isSpaceChar(data);
  }

  /**
   * Determines if this code point is white space according to Java, as reported by
   * {@link Character#isWhitespace(int)}.
   * A code point is a Java whitespace code point if and only if it satisfies
   * one of the following criteria:
   * <ul>
   * <li>It is a Unicode space code point ({@code SPACE_SEPARATOR},
   *     {@code LINE_SEPARATOR}, or {@code PARAGRAPH_SEPARATOR})
   *     but is not also a non-breaking space (U+00A0, U+2007, U+202F).
   * <li>It is U+0009, HORIZONTAL TABULATION.
   * <li>It is U+000A, LINE FEED.
   * <li>It is U+000B, VERTICAL TABULATION.
   * <li>It is U+000C, FORM FEED.
   * <li>It is U+000D, CARRIAGE RETURN.
   * <li>It is U+001C, FILE SEPARATOR.
   * <li>It is U+001D, GROUP SEPARATOR.
   * <li>It is U+001E, RECORD SEPARATOR.
   * <li>It is U+001F, UNIT SEPARATOR.
   * </ul>
   *
   * @return {@code true} if the code point is a Java whitespace code point;
   *         {@code false} otherwise.
   */
  public boolean isWhitespace() {
    return Character.isWhitespace(data);
  }

  /**
   * Determines if this code point is an ISO control code point, as reported by
   * {@link Character#isISOControl(int)}. A code point is considered to be an ISO
   * control code point if its code is in the range U+0000 through U+001F or in the
   * range U+007F through U+009F.
   *
   * @return {@code true} if the code point is an ISO control code point;
   *         {@code false} otherwise.
   */
  public boolean isISOControl() {
    return Character.isISOControl(data);
  }

  /**
   * Returns a value indicating this code point's general category, as reported by
   * {@link Character#getType(int)}.
   *
   * @return a value of type {@code int} representing the code point's general category.
   */
  public int getType() {
    return Character.getType(data);
  }

  /**
   * Returns the Unicode directionality property for this code point, as reported by
   * {@link Character#getDirectionality(int)}. Directionality is used to calculate the
   * visual ordering of text. The directionality value of undefined code points is
   * {@code DIRECTIONALITY_UNDEFINED}.
   *
   * @return the directionality property of the code point.
   */
  public byte getDirectionality() {
    return Character.getDirectionality(data);
  }

  /**
   * Determines whether this code point is mirrored according to the Unicode
   * specification, as reported by {@link Character#isMirrored(int)}. Mirrored code
   * points should have their glyphs horizontally mirrored when displayed in text that
   * is right-to-left. For example, U+0028 LEFT PARENTHESIS is semantically defined to
   * be an <i>opening parenthesis</i>. This will appear as a "(" in text that is
   * left-to-right but as a ")" in text that is right-to-left.
   *
   * @return {@code true} if the code point is mirrored, {@code false} if it is not
   *         mirrored or is not defined.
   */
  public boolean isMirrored() {
    return Character.isMirrored(data);
  }

  /**
   * Determines the number of char values needed to represent this code point.
   * If the code point is equal to or greater than 0x10000, then the method
   * returns 2. Otherwise, the method returns 1.
   *
   * @return 1 or 2, the length of this code point's UTF-16 encoding.
   */
  public int charCount() {
    return Character.charCount(data);
  }

  /**
   * Determines whether this code point is in the supplementary character range.
   * The method call is equivalent to the expression
   * {@code codePoint >= 0x10000 && codePoint <= 0x10FFFF}.
   *
   * @return {@code true} if the code point is in the Unicode supplementary character
   *         range; {@code false} otherwise.
   */
  public boolean isSupplementaryCodePoint() {
    return Character.isSupplementaryCodePoint(data);
  }

  ////////////////////////////////////////////////////////////////////////////////
  /////// The following are array methods, for handling char and Character strings
  ////////////////////////////////////////////////////////////////////////////////

  /**
   * Converts an array of code points to an array of char in UTF-16.
   *
   * @param codePoints The CodePoint array.
   * @return A UTF-16 array of char.
   */
  public static char[] toChars(CodePoint[] codePoints) {
    char[] utf = new char[charCount(codePoints)];
    int written = 0;
    for (int c = 0; c < codePoints.length; c++) {
      written += Character.toChars(codePoints[c].data, utf, written);
    }
    return utf;
  }

  /**
   * Converts an array of code points to an array of Characters in UTF-16.
   *
   * @param codePoints The CodePoint array.
   * @return A UTF-16 array of Character.
   */
  public static Character[] toCharacters(CodePoint[] codePoints) {
    Character[] utf = new Character[charCount(codePoints)];
    // Encode each code point into a reusable 2 element char array, then autobox
    // the chars into the correct Character elements.
    char[] scratch = new char[2];
    int written = 0;
    for (int c = 0; c < codePoints.length; c++) {
      int len = Character.toChars(codePoints[c].data, scratch, 0);
      utf[written] = scratch[0];
      if (len == 2) {
        utf[written + 1] = scratch[1];
      }
      written += len;
    }
    return utf;
  }

  /**
   * Converts an array of char to an array of CodePoints. Surrogate pairs are combined
   * into a single supplementary CodePoint; an unpaired surrogate becomes a CodePoint
   * of its own, matching {@link Character#codePointCount(char[], int, int)}.
   *
   * @param chars The char array in UTF-16.
   * @return A CodePoint array, with the decoded unicode characters.
   */
  public static CodePoint[] toCodePoints(char[] chars) {
    CodePoint[] codePoints = new CodePoint[Character.codePointCount(chars, 0, chars.length)];
    // Walk the char array by code point rather than by char, so that a surrogate
    // pair consumes two chars and yields exactly one CodePoint.
    int offset = 0;
    for (int i = 0; i < codePoints.length; i++) {
      int value = Character.codePointAt(chars, offset);
      codePoints[i] = new CodePoint(value);
      offset += Character.charCount(value);
    }
    return codePoints;
  }

  /**
   * Converts an array of characters to an array of CodePoints.
   *
   * @param characters The character array in UTF-16.
   * @return A CodePoint array, with the decoded unicode characters.
   */
  public static CodePoint[] toCodePoints(Character[] characters) {
    return toCodePoints(toChars(characters));
  }

  /**
   * Converts a String to an array of CodePoints.
   *
   * @param str The String to convert.
   * @return A CodePoint array, with the decoded unicode characters.
   */
  public static CodePoint[] toCodePoints(String str) {
    return toCodePoints(str.toCharArray());
  }

  /**
   * Returns a String object representing a CodePoint array.
   *
   * @param codePoints An array of CodePoints to convert to a string.
   * @return A String version of a unicode code point array.
   */
  public static String toString(CodePoint[] codePoints) {
    return new String(toChars(codePoints));
  }

  /**
   * Returns a list of CodePoints backed by the supplied CodePoint array.
   *
   * @param codePoints An array of CodePoints to convert to a List.
   * @return A {@link java.util.List} of CodePoints.
   */
  public static java.util.List<CodePoint> toList(CodePoint[] codePoints) {
    return java.util.Arrays.asList(codePoints);
  }

  /**
   * Returns a list of CodePoints which is the equivalent of a Unicode String.
   *
   * @param str A String to convert to a list of CodePoints.
   * @return A {@link java.util.List} of CodePoints, in the same order as the
   *         characters in {@code str}.
   */
  public static java.util.List<CodePoint> toList(String str) {
    return java.util.Arrays.asList(toCodePoints(str));
  }

  /**
   * Converts a {@link java.util.Collection} of CodePoints into an array.
   *
   * @param codePointCollection The collection of CodePoints to copy.
   * @return An array of CodePoints.
   */
  public static CodePoint[] toArray(java.util.Collection<CodePoint> codePointCollection) {
    return codePointCollection.toArray(new CodePoint[codePointCollection.size()]);
  }

  /**
   * Gets the number of char values needed to represent an array of CodePoints.
   *
   * @param codePoints The array of CodePoints to measure.
   * @return The total length of the UTF-16 encoding of all the CodePoints.
   */
  public static int charCount(CodePoint[] codePoints) {
    int total = 0;
    for (int i = 0; i < codePoints.length; i++) {
      total += Character.charCount(codePoints[i].data);
    }
    return total;
  }

  ////////////////////////////////////////////////////////////////////////////////
  /////// The following are private helper methods, for converting char and
  /////// Character strings to the other type.
  ////////////////////////////////////////////////////////////////////////////////

  /**
   * Converts an array of java.lang.Character to an array of char.
   *
   * @param characters The character array to convert.
   * @return An equivalent array of char.
   */
  private static char[] toChars(Character[] characters) {
    char[] chars = new char[characters.length];
    for (int i = 0; i < characters.length; i++) {
      chars[i] = characters[i];
    }
    return chars;
  }

  /**
   * Converts an array of char to an array of java.lang.Character.
   *
   * @param chars The char array to convert.
   * @return An equivalent array of Character.
   */
  private static Character[] toCharacters(char[] chars) {
    Character[] characters = new Character[chars.length];
    for (int i = 0; i < chars.length; i++) {
      characters[i] = chars[i];
    }
    return characters;
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment