Last active
September 16, 2021 22:34
Iterating over the "letters" (codepoints, not chars) of a string.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.emmanueloga.cracking.arrays; | |
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Objects;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
/**
 * Exposes a string as a sequence of "letters": one {@code String} per Unicode
 * code point rather than one per UTF-16 {@code char}.
 *
 * <p>A supplementary character (encoded as a surrogate pair) is returned as a
 * single two-{@code char} string. An unpaired surrogate is returned as a
 * one-{@code char} string, mirroring what {@link String#codePointAt(int)}
 * reports for it.
 */
class StringCodepointsIterable implements Iterable<String> {

    /**
     * Iterator that walks the enclosing iterable's string one code point at a
     * time. Removal is not supported.
     */
    public class StringCodepointsIterator implements Iterator<String> {
        /** Offset, in UTF-16 code units, of the next code point to return. */
        private int index = 0;

        /**
         * Always fails: the underlying string is immutable.
         *
         * @throws UnsupportedOperationException always
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        /** @return {@code true} while at least one code point remains */
        @Override
        public boolean hasNext() {
            return index < StringCodepointsIterable.this.string.length();
        }

        /**
         * Returns the next code point as a string and advances past it.
         *
         * @return a string holding exactly one code point (one or two chars)
         * @throws NoSuchElementException if the string has been fully consumed
         */
        @Override
        public String next() {
            // Honour the Iterator contract instead of leaking the
            // StringIndexOutOfBoundsException that codePointAt would throw.
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            int codePoint = StringCodepointsIterable.this.string.codePointAt(index);
            // Advance by 2 for a surrogate pair, 1 otherwise.
            index += Character.charCount(codePoint);
            return new String(Character.toChars(codePoint));
        }
    }

    /** The string whose code points are iterated. Never {@code null}. */
    private final String string;

    /**
     * Creates an iterable over the code points of {@code string}.
     *
     * @param string the text to iterate over
     * @throws NullPointerException if {@code string} is {@code null}
     */
    public StringCodepointsIterable(final String string) {
        // Fail fast here rather than with an NPE on the first hasNext() call.
        this.string = Objects.requireNonNull(string, "string");
    }

    /** @return a fresh iterator positioned at the start of the string */
    @Override
    public Iterator<String> iterator() {
        return new StringCodepointsIterator();
    }
}
public class IterateCodepoints { | |
public static void main(String... args) { | |
// Uses the beautiful Mathematical Capital Script C char: | |
// http://unicode-table.com/en/1D49E/ | |
String unicode = new String(Character.toChars(0x1d49e)) + ":Hi"; | |
System.out.print("String: " + unicode + "(length " + unicode.length() | |
+ ", " + unicode.codePointCount(0, unicode.length()) | |
+ " codepoints)"); | |
System.out.print("\ncharAt for each index in (0...str.length() - 1): "); | |
for (int i = 0; i < unicode.length(); i++) { | |
System.out.print(unicode.charAt(i)); | |
System.out.print("_"); | |
} | |
System.out.print("\nGuava's Lists.charactersOf: "); | |
ImmutableList<Character> chars = Lists.charactersOf(unicode); | |
for (Character c : chars) { | |
System.out.print(c); | |
System.out.print("_"); | |
} | |
System.out.print("\nCODEPOINTS: "); | |
for (int i = 0; i < unicode.length();) { | |
int cp = unicode.codePointAt(i); | |
System.out.print(Character.toChars(cp)); | |
System.out.print("_"); | |
i += Character.isSupplementaryCodePoint(cp) ? 2 : 1; | |
} | |
System.out.print("\nCODEPOINTS INTSTREAM: "); | |
unicode.codePoints().forEach(c -> { | |
System.out.print(Character.toChars(c)); | |
System.out.print("_"); | |
}); | |
System.out.print("\nStringCodepointsIterable: "); | |
for (String stringOfSingleCodepoing : new StringCodepointsIterable( | |
unicode)) { | |
System.out.print(stringOfSingleCodepoing); | |
System.out.print("_"); | |
} | |
} | |
} | |
// Output:
// String: 𝒞:Hi(length 5, 4 codepoints)
// charAt for each index in (0...str.length() - 1): ?_?_:_H_i_
// Guava's Lists.charactersOf: ?_?_:_H_i_
// CODEPOINTS: 𝒞_:_H_i_
// CODEPOINTS INTSTREAM: 𝒞_:_H_i_
// StringCodepointsIterable: 𝒞_:_H_i_
@EmmanuelOga - we've been using code based on this gist in https://poi.apache.org/ (an open source project) for a while. A query has come up about whether we should be doing this without your approval. If at all possible, would you be able to Apache-license this gist (perhaps by editing the code snippet and prepending a license as a comment)? If you object, we can look at removing the code from POI at the earliest possible juncture.
I've replaced the code in Apache POI with a new implementation - https://svn.apache.org/viewvc?view=revision&revision=1893385
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
http://stackoverflow.com/questions/3925130/java-how-to-get-iteratorcharacter-from-string/3925162#3925162