Skip to content

Instantly share code, notes, and snippets.

@quoll
Created May 21, 2013 17:07
Show Gist options
  • Save quoll/5621440 to your computer and use it in GitHub Desktop.
Save quoll/5621440 to your computer and use it in GitHub Desktop.
Provides codepoint support as an extension to what java.lang.Character can do
package util;
import java.util.*;
/**
* This method fills in where java.lang.Character can't manage 21 bit Unicode.
* Represents Unicode Scalar Values, U+0000 to U+10FFFF
*
* @author Paul Gearon
*/
public class CodePoint implements java.io.Serializable, Comparable<CodePoint> {
/** The serialization ID. */
private static final long serialVersionUID = 3212781738993088980L;
/** The internal representation. */
private final int data;
/** The maximum value of a Unicode code point. */
public static final CodePoint MAX_VALUE = new CodePoint(Character.MAX_CODE_POINT);
/** The minimum value of a Unicode code point. */
public static final CodePoint MIN_VALUE = new CodePoint(Character.MIN_CODE_POINT);
/**
* Constructs a new CodePoint for a given Character value.
* @param value The value to be represented by the CodePoint.
*/
public CodePoint(Character value) {
data = value.charValue();
}
/**
* Constructs a new CodePoint for a given char value.
* @param value The value to be represented by the CodePoint.
*/
public CodePoint(char value) {
data = value;
}
/**
* Constructs a new CodePoint for a given Unicode scalar value.
* @param value The Unicode scalar value of this CodePoint.
*/
public CodePoint(int value) {
if (!Character.isValidCodePoint(value)) throw new IllegalArgumentException("Code point out of range");
data = value;
}
/**
* Constructs a new codepoint for a given char array.
* @param value The value to be represented by the codepoint in UTF-16.
*/
public CodePoint(char[] value) {
data = Character.codePointAt(value, 0);
}
/**
* Constructs a new codepoint for a given Character array.
* @param value The value to be represented by the codepoint in UTF-16.
*/
public CodePoint(Character[] value) {
char[] tmpValue;
if (Character.isHighSurrogate(value[0])) {
if (value.length < 2) throw new IllegalArgumentException("Malformed UniCode character array");
tmpValue = new char[] { value[0], value[1] };
} else {
tmpValue = new char[] { value[0] };
}
data = Character.codePointAt(tmpValue, 0);
}
/**
* Constructs a new codepoint from an offset into a given char array.
* @param chars The UTF-16 array containing the CodePoints.
* @param offset The offset into the array to read from.
*/
public CodePoint(char[] chars, int offset) {
data = Character.codePointAt(chars, offset);
}
/**
* Constructs a new codepoint for a given Character array.
* @param characters The UTF-16 array containing the CodePoints.
* @param offset The offset into the array to read from.
*/
public CodePoint(Character[] characters, int offset) {
char[] tmpValue;
if (Character.isHighSurrogate(characters[0].charValue())) {
if (characters.length - offset < 2) throw new IllegalArgumentException("Malformed UniCode character array");
tmpValue = new char[] { characters[offset], characters[offset + 1] };
} else {
tmpValue = new char[] { characters[offset] };
}
data = Character.codePointAt(tmpValue, 0);
}
/**
* Gets a Unicode scalar value for use with java.lang.Character methods.
* @return The UniCode scalar value of this CodePoint.
*/
public int intValue() {
return data;
}
/**
* Convert this code point to its UTF-16 representation stored in a char array.
* @return An array containing the UTF-16 representation of the current code point.
*/
public char[] toChars() {
return Character.toChars(data);
}
/**
* Convert this code point to its UTF-16 representation stored in a provided char array.
* @param dst An array of char in which the codePoint's UTF-16 value is stored.
* @param dstIndex The start index into the dst array where the converted value is stored.
* @return 1 if the code point is a BMP code point, 2 if the code point is a supplementary code point.
*/
public int toChars(char[] dst, int dstIndex) {
return Character.toChars(data, dst, dstIndex);
}
/**
* Convert this code point to its UTF-16 representation stored in a Character array.
* @return An array containing the UTF-16 representation of the current code point.
*/
public Character[] toCharacters() {
return toCharacters(Character.toChars(data));
}
/**
* Convert this code point to its UTF-16 representation stored in a provided char array.
* @param dst An array of char in which the codePoint's UTF-16 value is stored.
* @param dstIndex The start index into the dst array where the converted value is stored.
* @return 1 if the code point is a BMP code point, 2 if the code point is a supplementary code point.
*/
public int toCharacters(Character[] dst, int dstIndex) {
char[] tmpDst = new char[2];
int size = Character.toChars(data, tmpDst, 0);
dst[dstIndex] = tmpDst[0];
if (size == 2) dst[dstIndex + 1] = tmpDst[1];
return size;
}
/**
* Tests for equality.
* @param o The object to compare to.
* @return <code>true</code> if o is equal to this object.
*/
public boolean equals(Object o) {
return (o instanceof CodePoint) && ((CodePoint)o).data == data;
}
/**
* Gets a hashcode for this object.
* @return An integer hash code.
*/
public int hashCode() {
return data;
}
/**
* Performs a comparison on another CodePoint object.
* @param c The CodePoint to compare to.
* @return A negative integer, zero, or a positive integer as this object is less than, equal to,
* or greater than the specified object.
*/
public int compareTo(CodePoint c) {
// bit ranges guarantee that we can take data-c.data, but this is poor practice in general
if (data == c.data) return 0;
if (data < c.data) return -1;
return 1;
}
/**
* Returns a String object representing this codepoint's value.
* @return A String version of this unicode character.
*/
public String toString() {
return new String(toChars());
}
/**
* Determines if the specified codepoint is lowercase.
* <p>
* A codepoint is lowercase if its general category type, provided
* by <code>CodePoint.getType()</code>, is
* <code>LOWERCASE_LETTER</code>.
* <p>
* The following are examples of lowercase codepoints:
* <p><blockquote><pre>
* a b c d e f g h i j k l m n o p q r s t u v w x y z
* '&#92;u00DF' '&#92;u00E0' '&#92;u00E1' '&#92;u00E2' '&#92;u00E3' '&#92;u00E4' '&#92;u00E5' '&#92;u00E6'
* '&#92;u00E7' '&#92;u00E8' '&#92;u00E9' '&#92;u00EA' '&#92;u00EB' '&#92;u00EC' '&#92;u00ED' '&#92;u00EE'
* '&#92;u00EF' '&#92;u00F0' '&#92;u00F1' '&#92;u00F2' '&#92;u00F3' '&#92;u00F4' '&#92;u00F5' '&#92;u00F6'
* '&#92;u00F8' '&#92;u00F9' '&#92;u00FA' '&#92;u00FB' '&#92;u00FC' '&#92;u00FD' '&#92;u00FE' '&#92;u00FF'
* </pre></blockquote>
* <p> Many other Unicode codepoints are lowercase too.
* <p>
*
* @return <code>true</code> if the codepoint is lowercase;
* <code>false</code> otherwise.
*/
public boolean isLowerCase() {
return Character.isLowerCase(data);
}
/**
* Determines if the codepoint is an uppercase codepoint.
* <p>
* A codepoint is uppercase if its general category type, provided by
* <code>sodePoint.getType()</code>, is <code>UPPERCASE_LETTER</code>.
* <p>
* The following are examples of uppercase codepoints:
* <p><blockquote><pre>
* A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
* '&#92;u00C0' '&#92;u00C1' '&#92;u00C2' '&#92;u00C3' '&#92;u00C4' '&#92;u00C5' '&#92;u00C6' '&#92;u00C7'
* '&#92;u00C8' '&#92;u00C9' '&#92;u00CA' '&#92;u00CB' '&#92;u00CC' '&#92;u00CD' '&#92;u00CE' '&#92;u00CF'
* '&#92;u00D0' '&#92;u00D1' '&#92;u00D2' '&#92;u00D3' '&#92;u00D4' '&#92;u00D5' '&#92;u00D6' '&#92;u00D8'
* '&#92;u00D9' '&#92;u00DA' '&#92;u00DB' '&#92;u00DC' '&#92;u00DD' '&#92;u00DE'
* </pre></blockquote>
* <p> Many other Unicode codepoints are uppercase too.<p>
*
* @return <code>true</code> if the codepoint is uppercase;
* <code>false</code> otherwise.
*/
public boolean isUpperCase() {
return Character.isUpperCase(data);
}
/**
* Determines if the codepoint is a titlecase codepoint.
* <p>
* A codepoint is a titlecase codepoint if its general
* category type, provided by <code>CodePoint.getType()</code>,
* is <code>TITLECASE_LETTER</code>.
* <p>
* Some codepoints look like pairs of Latin letters. For example, there
* is an uppercase letter that looks like "LJ" and has a corresponding
* lowercase letter that looks like "lj". A third form, which looks like "Lj",
* is the appropriate form to use when rendering a word in lowercase
* with initial capitals, as for a book title.
* <p>
* These are some of the Unicode codepoints for which this method returns
* <code>true</code>:
* <ul>
* <li><code>LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON</code>
* <li><code>LATIN CAPITAL LETTER L WITH SMALL LETTER J</code>
* <li><code>LATIN CAPITAL LETTER N WITH SMALL LETTER J</code>
* <li><code>LATIN CAPITAL LETTER D WITH SMALL LETTER Z</code>
* </ul>
* <p> Many other Unicode codepoints are titlecase too.<p>
*
* @return <code>true</code> if the codepoint is titlecase;
* <code>false</code> otherwise.
*/
public boolean isTitleCase() {
return Character.isTitleCase(data);
}
/**
* Determines if the codepoint is a digit.
* <p>
* A codepoint is a digit if its general category type, provided
* by <code>CodePoint.getType()</code>, is
* <code>DECIMAL_DIGIT_NUMBER</code>.
* <p>
* Some Unicode codepoint ranges that contain digits:
* <ul>
* <li><code>'&#92;u0030'</code> through <code>'&#92;u0039'</code>,
* ISO-LATIN-1 digits (<code>'0'</code> through <code>'9'</code>)
* <li><code>'&#92;u0660'</code> through <code>'&#92;u0669'</code>,
* Arabic-Indic digits
* <li><code>'&#92;u06F0'</code> through <code>'&#92;u06F9'</code>,
* Extended Arabic-Indic digits
* <li><code>'&#92;u0966'</code> through <code>'&#92;u096F'</code>,
* Devanagari digits
* <li><code>'&#92;uFF10'</code> through <code>'&#92;uFF19'</code>,
* Fullwidth digits
* </ul>
*
* Many other codepoint ranges contain digits as well.
*
* @return <code>true</code> if the codepoint is a digit;
* <code>false</code> otherwise.
*/
public boolean isDigit() {
return Character.isDigit(data);
}
/**
* Determines if a codepoint is defined in Unicode.
* <p>
* A codepoint is defined if at least one of the following is true:
* <ul>
* <li>It has an entry in the UnicodeData file.
* <li>It has a value in a range defined by the UnicodeData file.
* </ul>
*
* @return <code>true</code> if the codepoint has a defined meaning
* in Unicode; <code>false</code> otherwise.
*/
public boolean isDefined() {
return Character.isDefined(data);
}
/**
* Determines if the codepoint is a letter.
* <p>
* A codepoint is considered to be a letter if its general
* category type, provided by <code>CodePoint.getType()</code>,
* is any of the following:
* <ul>
* <li> <code>UPPERCASE_LETTER</code>
* <li> <code>LOWERCASE_LETTER</code>
* <li> <code>TITLECASE_LETTER</code>
* <li> <code>MODIFIER_LETTER</code>
* <li> <code>OTHER_LETTER</code>
* </ul>
*
* Not all letters have case. Many codepoints are
* letters but are neither uppercase nor lowercase nor titlecase.
*
* @return <code>true</code> if the codepoints is a letter;
* <code>false</code> otherwise.
*/
public boolean isLetter() {
return Character.isLetter(data);
}
/**
* Determines if the codepoint is a letter or digit.
* <p>
* A codepoints is considered to be a letter or digit if either
* <code>CodePoint.isLetter()</code> or
* <code>CodePoint.isDigit()</code> returns
* <code>true</code> for the codepoint.
*
* @return <code>true</code> if the codepoint is a letter or digit;
* <code>false</code> otherwise.
*/
public boolean isLetterOrDigit() {
return Character.isLetterOrDigit(data);
}
/**
* Determines if the codepoint is
* permissible as the first codepoint in a Java identifier.
* <p>
* A codepoint may start a Java identifier if and only if
* one of the following conditions is true:
* <ul>
* <li> {@link #isLetter() isLetter()} returns <code>true</code>
* <li> {@link #getType() getType()} returns <code>LETTER_NUMBER</code>
* <li> codepoint is a currency symbol (such as "$")
* <li> codepoint is a connecting punctuation codepoint (such as "_").
* </ul>
*
* @return <code>true</code> if the codepoint may start a Java identifier;
* <code>false</code> otherwise.
*/
public boolean isJavaIdentifierStart() {
return Character.isJavaIdentifierStart(data);
}
/**
* Determines if the codepoint may be part of a Java
* identifier as other than the first codepoint.
* <p>
* A codepoint may be part of a Java identifier if any of the following
* are true:
* <ul>
* <li> it is a letter
* <li> it is a currency symbol (such as <code>'$'</code>)
* <li> it is a connecting punctuation codepoint (such as <code>'_'</code>)
* <li> it is a digit
* <li> it is a numeric letter (such as a Roman numeral codepoint)
* <li> it is a combining mark
* <li> it is a non-spacing mark
* <li> <code>isIdentifierIgnorable</code> returns
* <code>true</code> for the codepoint
* </ul>
*
* @return <code>true</code> if the codepoint may be part of a
* Java identifier; <code>false</code> otherwise.
*/
public boolean isJavaIdentifierPart() {
return Character.isJavaIdentifierPart(data);
}
/**
* Determines if the codepoint is permissible as the
* first codepoint in a Unicode identifier.
* <p>
* A codepoint may start a Unicode identifier if and only if
* one of the following conditions is true:
* <ul>
* <li> {@link #isLetter() isLetter()} returns <code>true</code>
* <li> {@link #getType() getType()} returns
* <code>LETTER_NUMBER</code>.
* </ul>
* @return <code>true</code> if the codepoint may start a Unicode
* identifier; <code>false</code> otherwise.
*/
public boolean isUnicodeIdentifierStart() {
return Character.isUnicodeIdentifierStart(data);
}
/**
* Determines if the codepoint may be part of a Unicode
* identifier as other than the first codepoint.
* <p>
* A codepoint may be part of a Unicode identifier if and only if
* one of the following statements is true:
* <ul>
* <li> it is a letter
* <li> it is a connecting punctuation codepoint (such as <code>'_'</code>)
* <li> it is a digit
* <li> it is a numeric letter (such as a Roman numeral codepoint)
* <li> it is a combining mark
* <li> it is a non-spacing mark
* <li> <code>isIdentifierIgnorable</code> returns
* <code>true</code> for this codepoint.
* </ul>
*
* @return <code>true</code> if the codepoint may be part of a
* Unicode identifier; <code>false</code> otherwise.
*/
public boolean isUnicodeIdentifierPart() {
return Character.isUnicodeIdentifierPart(data);
}
/**
* Determines if the codepoint should be regarded as
* an ignorable codepoint in a Java identifier or a Unicode identifier.
* <p>
* The following Unicode codepoints are ignorable in a Java identifier
* or a Unicode identifier:
* <ul>
* <li>ISO control codepoints that are not whitespace
* <ul>
* <li><code>'&#92;u0000'</code> through <code>'&#92;u0008'</code>
* <li><code>'&#92;u000E'</code> through <code>'&#92;u001B'</code>
* <li><code>'&#92;u007F'</code> through <code>'&#92;u009F'</code>
* </ul>
*
* <li>all codepoints that have the <code>FORMAT</code> general
* category value
* </ul>
*
* @return <code>true</code> if the codepoint is an ignorable control
* codepoint that may be part of a Java or Unicode identifier;
* <code>false</code> otherwise.
*/
public boolean isIdentifierIgnorable() {
return Character.isIdentifierIgnorable(data);
}
/**
* Converts the codepoint argument to lowercase using case
* mapping information from the UnicodeData file.
* <p>
* Note that
* <code>CodePoint.isLowerCase(CodePoint.toLowerCase())</code>
* does not always return <code>true</code> for some ranges of
* codepoint, particularly those that are symbols or ideographs.
*
* @return the lowercase equivalent of the codepoint, if any;
* otherwise, the codepoint itself.
*/
public CodePoint toLowerCase() {
int lc = Character.toLowerCase(data);
return lc == data ? this : new CodePoint(lc);
}
/**
* Converts the codepoint argument to uppercase using case mapping
* information from the UnicodeData file.
* <p>
* Note that
* <code>CodePoint.isUpperCase(CodePoint.toUpperCase())</code>
* does not always return <code>true</code> for some ranges of
* codepoints, particularly those that are symbols or ideographs.
*
* @return the uppercase equivalent of the codepoint, if any;
* otherwise, an equal codepoint.
*/
public CodePoint toUpperCase() {
int uc = Character.toUpperCase(data);
return uc == data ? this : new CodePoint(uc);
}
/**
* Converts the codepoint argument to titlecase using case mapping
* information from the UnicodeData file. If a codepoint has no
* explicit titlecase mapping and is not itself a titlecase codepoint
* according to UnicodeData, then the uppercase mapping is
* returned as an equivalent titlecase mapping. If the
* codepoint is already titlecase, the same codepoint value will be
* returned.
* <p>
* Note that
* <code>codepoint.isTitleCase(codepoint.toTitleCase())</code>
* does not always return <code>true</code> for some ranges of
* codepoints.
*
* @return the titlecase equivalent of the codepoint, if any;
* otherwise, an equal codepoint.
*/
public CodePoint toTitleCase() {
int tc = Character.toTitleCase(data);
return tc == data ? this : new CodePoint(tc);
}
/**
* Returns the numeric value of the codepoint in the specified radix.
* <p>
* If the radix is not in the range <code>MIN_RADIX</code>&nbsp;&lt;=
* <code>radix</code>&nbsp;&lt;= <code>MAX_RADIX</code> or if the
* value of the codepoint is not a valid digit in the specified
* radix, <code>-1</code> is returned. A codepoint is a valid digit
* if at least one of the following is true:
* <ul>
* <li>The method <code>isDigit</code> is <code>true</code> of the codepoint
* and the Unicode decimal digit value of the codepoint (or its
* single-codepoint decomposition) is less than the specified radix.
* In this case the decimal digit value is returned.
* <li>The codepoint is one of the uppercase Latin letters
* <code>'A'</code> through <code>'Z'</code> and its code is less than
* <code>radix&nbsp;+ 'A'&nbsp;-&nbsp;10</code>.
* In this case, <code>code&nbsp;- 'A'&nbsp;+&nbsp;10</code>
* is returned.
* <li>The codepoint is one of the lowercase Latin letters
* <code>'a'</code> through <code>'z'</code> and its code is less than
* <code>radix&nbsp;+ 'a'&nbsp;-&nbsp;10</code>.
* In this case, <code>code&nbsp;- 'a'&nbsp;+&nbsp;10</code>
* is returned.
* </ul>
*
* @param radix the radix.
* @return the numeric value represented by the codepoint in the
* specified radix.
*/
public int digit(int radix) {
return Character.digit(data, radix);
}
/**
* Returns the <code>int</code> value that the specified Unicode
* codepoint represents. For example, the codepoint
* <code>'&#92;u216C'</code> (the roman numeral fifty) will return
* an int with a value of 50.
* <p>
* The letters A-Z in their uppercase (<code>'&#92;u0041'</code> through
* <code>'&#92;u005A'</code>), lowercase
* (<code>'&#92;u0061'</code> through <code>'&#92;u007A'</code>), and
* full width variant (<code>'&#92;uFF21'</code> through
* <code>'&#92;uFF3A'</code> and <code>'&#92;uFF41'</code> through
* <code>'&#92;uFF5A'</code>) forms have numeric values from 10
* through 35. This is independent of the Unicode specification,
* which does not assign numeric values to these codepoint
* values.
* <p>
* If the codepoint does not have a numeric value, then -1 is returned.
* If the codepoint has a numeric value that cannot be represented as a
* nonnegative integer (for example, a fractional value), then -2
* is returned.
*
* @return the numeric value of the codepoint, as a nonnegative <code>int</code>
* value; -2 if the codepoint has a numeric value that is not a
* nonnegative integer; -1 if the codepoint has no numeric value.
*/
public int getNumericValue() {
return Character.getNumericValue(data);
}
/**
* Determines if the codepoint is a Unicode space codepoint.
* A codepoint is considered to be a space codepoint if and only if
* it is specified to be a space codepoint by the Unicode standard. This
* method returns true if the codepoint's general category type is any of
* the following:
* <ul>
* <li> <code>SPACE_SEPARATOR</code>
* <li> <code>LINE_SEPARATOR</code>
* <li> <code>PARAGRAPH_SEPARATOR</code>
* </ul>
*
* @return <code>true</code> if the codepoint is a space codepoint;
* <code>false</code> otherwise.
*/
public boolean isSpaceChar() {
return Character.isSpaceChar(data);
}
/**
* Determines if the codepoint is white space according to Java.
* A codepoint is a Java whitespace codepoint if and only if it satisfies
* one of the following criteria:
* <ul>
* <li> It is a Unicode space codepoint (<code>SPACE_SEPARATOR</code>,
* <code>LINE_SEPARATOR</code>, or <code>PARAGRAPH_SEPARATOR</code>)
* but is not also a non-breaking space (<code>'&#92;u00A0'</code>,
* <code>'&#92;u2007'</code>, <code>'&#92;u202F'</code>).
* <li> It is <code>'&#92;u0009'</code>, HORIZONTAL TABULATION.
* <li> It is <code>'&#92;u000A'</code>, LINE FEED.
* <li> It is <code>'&#92;u000B'</code>, VERTICAL TABULATION.
* <li> It is <code>'&#92;u000C'</code>, FORM FEED.
* <li> It is <code>'&#92;u000D'</code>, CARRIAGE RETURN.
* <li> It is <code>'&#92;u001C'</code>, FILE SEPARATOR.
* <li> It is <code>'&#92;u001D'</code>, GROUP SEPARATOR.
* <li> It is <code>'&#92;u001E'</code>, RECORD SEPARATOR.
* <li> It is <code>'&#92;u001F'</code>, UNIT SEPARATOR.
* </ul>
*
* @return <code>true</code> if the codepoint is a Java whitespace
* codepoint; <code>false</code> otherwise.
*/
public boolean isWhitespace() {
return Character.isWhitespace(data);
}
/**
* Determines if the codepoint is an ISO control
* codepoint. A codepoint is considered to be an ISO control
* codepoint if its code is in the range <code>'&#92;u0000'</code>
* through <code>'&#92;u001F'</code> or in the range
* <code>'&#92;u007F'</code> through <code>'&#92;u009F'</code>.
*
* @return <code>true</code> if the codepoint is an ISO control codepoint;
* <code>false</code> otherwise.
*/
public boolean isISOControl() {
return Character.isISOControl(data);
}
/**
* Returns a value indicating a codepoint's general category.
*
* @return a value of type <code>int</code> representing the
* codepoint's general category.
*/
public int getType() {
return Character.getType(data);
}
/**
* Returns the Unicode directionality property for the given
* codepoint. codepoint directionality is used to calculate the
* visual ordering of text. The directionality value of undefined
* <code>char</code> values is <code>DIRECTIONALITY_UNDEFINED</code>.
*
* @return the directionality property of the <code>char</code> value.
*/
public byte getDirectionality() {
return Character.getDirectionality(data);
}
/**
* Determines whether the codepoint is mirrored according to the
* Unicode specification. Mirrored codepoints should have their
* glyphs horizontally mirrored when displayed in text that is
* right-to-left. For example, <code>'&#92;u0028'</code> LEFT
* PARENTHESIS is semantically defined to be an <i>opening
* parenthesis</i>. This will appear as a "(" in text that is
* left-to-right but as a ")" in text that is right-to-left.
*
* @return <code>true</code> if the codepoint is mirrored, <code>false</code>
* if the is not mirrored or is not defined.
*/
public boolean isMirrored() {
return Character.isMirrored(data);
}
/**
* Determines the number of char values needed to represent this codepoint.
* If the specified character is equal to or greater than 0x10000, then
* the method returns 2. Otherwise, the method returns 1.
*/
public int charCount() {
return Character.charCount(data);
}
/**
* Determines whether the specified codepoint is in the supplementary
* character range. The method call is equivalent to the expression:
* <pre><code> codePoint &gt;= 0x10000 &amp;&amp; codePoint &lt;= 0x10ffff</code></pre>
*
* @return <code>true</code> if the specified codepoint is in the Unicode
* supplementary character range; <code>false</code> otherwise.
*/
public boolean isSupplementaryCodePoint() {
return Character.isSupplementaryCodePoint(data);
}
////////////////////////////////////////////////////////////////////////////////
/////// The following are array methods, for handling char and Character strings
////////////////////////////////////////////////////////////////////////////////
/**
* Convert an array of code points to an array of char in UTF-16.
* @param codePoints The CodePoint array.
* @return A UTF-16 array of char.
*/
public static char[] toChars(CodePoint[] codePoints) {
char[] utf = new char[charCount(codePoints)];
for (int c = 0, u = 0; c < codePoints.length; c++) u += Character.toChars(codePoints[c].data, utf, u);
return utf;
}
/**
* Convert an array of code points to an array of Characters in UTF-16. Prematurely optimized.
* @param codePoints The CodePoint array.
* @return A UTF-16 array of char.
*/
public static Character[] toCharacters(CodePoint[] codePoints) {
Character[] utf = new Character[charCount(codePoints)];
// iterate through copying to a 2 element char array, before autoboxing into the correct Character elements
char[] tmpUtf = new char[2];
for (int c = 0, u = 0; c < codePoints.length; c++) {
int len = Character.toChars(codePoints[c].data, tmpUtf, 0);
utf[u] = tmpUtf[0];
if (len == 2) utf[u + 1] = tmpUtf[1];
u += len;
}
return utf;
}
/**
* Converts an array of char to an array of CodePoints.
* @param chars The char array in UTF-16.
* @return A CodePoint array, with the decoded unicode characters.
*/
public static CodePoint[] toCodePoints(char[] chars) {
CodePoint[] codePoints = new CodePoint[Character.codePointCount(chars, 0, chars.length)];
for (int i = 0; i < codePoints.length; i++) codePoints[i] = new CodePoint(chars[i]);
return codePoints;
}
/**
* Converts an array of characters to an array of CodePoints.
* @param characters The character array in UTF-16.
* @return A CodePoint array, with the decoded unicode characters.
*/
public static CodePoint[] toCodePoints(Character[] characters) {
return toCodePoints(toChars(characters));
}
/**
* Converts a String to an array of CodePoints.
* @param str The String to convert.
* @return A CodePoint array, with the decoded unicode characters.
*/
public static CodePoint[] toCodePoints(String str) {
return toCodePoints(str.toCharArray());
}
/**
* Returns a String object representing a CodePoint array.
* @param codePoints An array of CodePoints to convert to a string.
* @return A String version of a unicode codepoint array.
*/
public static String toString(CodePoint[] codePoints) {
return new String(toChars(codePoints));
}
/**
* Returns a list of CodePoints backed by the supplied CodePoint array.
* @param codePoints An array of CodePoints to convert to a List.
* @return A {@link java.util.List} of CodePoints.
*/
public static java.util.List<CodePoint> toList(CodePoint[] codePoints) {
return java.util.Arrays.asList(codePoints);
}
/**
* Returns a list of CodePoints which is the equivalent of a Unicode String.
* @param str A String to convert to a list of CodePoints.
* @return A {@link java.util.List} of CodePoints, in the same order as the
* characters in <code>str</code>.
*/
public static java.util.List<CodePoint> toList(String str) {
return java.util.Arrays.asList(toCodePoints(str));
}
/**
* Converts a {@link java.util.Collection} of CodePoints into an array.
* @return An array of CodePoints.
*/
public static CodePoint[] toArray(java.util.Collection<CodePoint> codePointCollection) {
return codePointCollection.toArray(new CodePoint[codePointCollection.size()]);
}
/**
* Gets the number of char values needed to represent an array of CodePoints.
* @param codePoints The array of CodePoints to measure.
*/
public static int charCount(CodePoint[] codePoints) {
int total = 0;
for (int i = 0; i < codePoints.length; i++) total += Character.charCount(codePoints[i].data);
return total;
}
////////////////////////////////////////////////////////////////////////////////
/////// The following are private helper methods, for converting char and
/////// Character strings to the other type.
////////////////////////////////////////////////////////////////////////////////
/**
* Converts an array of java.lang.Character to an array of char.
* @param characters The character array to convert.
* @return An equivalent array of char.
*/
private static final char[] toChars(Character[] characters) {
char[] chars = new char[characters.length];
for (int i = 0; i < characters.length; i++) chars[i] = characters[i];
return chars;
}
/**
* Converts an array of char to an array of java.lang.Character.
* @param chars The char array to convert.
* @return An equivalent array of Character.
*/
private static final Character[] toCharacters(char[] chars) {
Character[] characters = new Character[chars.length];
for (int i = 0; i < chars.length; i++) characters[i] = chars[i];
return characters;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment