Created
July 6, 2015 18:41
-
-
Save jtbates/c9e4d30247539a722af7 to your computer and use it in GitHub Desktop.
rJava incorrect handling of UTF-16 surrogate pairs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimal reproduction: fetch a Java string holding a single supplementary
# code point (stored in Java as a UTF-16 surrogate pair) through rJava and
# inspect the bytes R receives.
# Expects UnicodePoints.jar to be reachable at this relative path.
library(rJava)
.jinit()
.jaddClassPath("UnicodePoints.jar")
unicode.points <- .jnew("UnicodePoints")

# "S" asks rJava to convert the returned java.lang.String to an R string.
emoticon <- .jcall(unicode.points, "S", "getAstonishedFace")
cat(paste(emoticon, "\n"))

# Each surrogate arrives as its own 3-byte sequence instead of one 4-byte
# UTF-8 sequence for the whole code point.
charToRaw(emoticon) # ed a0 bd ed b8 b2

# A string operation on the ill-formed bytes; this is the call that
# demonstrates the failure.
tolower(emoticon)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demonstration of rJava returning ill-formed UTF-8 for Java strings that
# contain a UTF-16 surrogate pair, followed by a workaround that lets Java
# perform the UTF-16 -> UTF-8 conversion itself.
# Expects UnicodePoints.jar to be reachable at this relative path.
library(rJava)
.jinit()
.jaddClassPath("UnicodePoints.jar")
unicode.points <- .jnew("UnicodePoints")

# localeToCharset() may return several candidates; keep the first one so the
# informational message below stays a single line.
charset <- localeToCharset()[1]

cat("\nLet's try to get the bug pictograph U+1F41B from Java...\n\n")
r.string1 <- .jcall(unicode.points, "S", "getBug")

cat("This is how it displays:\n")
cat(paste(r.string1, "\n\n"))
cat("It should display as:\n")
cat("\U1F41B \n\n")

cat(paste("The encoding for the string we get from rJava is: ",
          Encoding(r.string1), "\n\n"))
cat(paste("A string with \"unknown\" encoding should be treated as the local",
          "charset: ", charset, "\n\n"))

cat("The raw encoding of the string we get from rJava is:\n")
print(charToRaw(r.string1)) # ed a0 bd ed b0 9b
cat("\nWe see that the surrogate pair is encoded as a pair of 3 byte \n")
cat("sequences and is thus ill-formed according to the UTF-8 standard: \n")
cat("http://unicode.org/faq/utf_bom.html#utf8-4 \n\n")

cat("String operations will result in an error: \n")
tryCatch({
  tolower(r.string1)
}, error = function(e) {
  # Report only the human-readable message rather than pasting the whole
  # condition object (which would also emit the deparsed call).
  cat(paste(conditionMessage(e), "\n\n"))
})

cat("Java can properly convert from UTF-16 to UTF-8 to avoid this error.\n\n")
# evalString = FALSE keeps the result as a Java object reference so that
# String.getBytes() can be invoked on it.
java.string2 <- .jcall(unicode.points, "S", "getBug",
                       evalString = FALSE)
# Request UTF-8 explicitly: the locale charset is not guaranteed to be UTF-8
# and may be unable to represent U+1F41B at all.
bytes2 <- .jcall(java.string2, "[B", method = "getBytes", "UTF-8")
r.string2 <- rawToChar(bytes2)
# rawToChar() leaves the encoding unmarked; declare it so R interprets the
# bytes as UTF-8 regardless of the session locale.
Encoding(r.string2) <- "UTF-8"

cat("Using the getBytes String method with the UTF-8 charset gives us:\n")
print(charToRaw(r.string2)) # f0 9f 90 9b
cat("\nThis is the correct UTF-8 4 byte sequence.\n\n")
cat("It displays correctly:\n")
cat(paste0(r.string2, "\n\n")) # correct display
cat("And it can be used in string operations without error.\n")
tolower(r.string2) # executes without error
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Compile to jar file with:
// $ javac UnicodePoints.java
// $ jar cf UnicodePoints.jar UnicodePoints.class

/**
 * Supplies strings that each hold one supplementary-plane code point,
 * written as a UTF-16 surrogate pair, for exercising string conversion
 * in callers.
 */
class UnicodePoints {
    /**
     * @return a two-char string holding the surrogate pair D83D DE32,
     *         i.e. code point U+1F632
     */
    public static String getAstonishedFace() {
        return "\uD83D\uDE32";
    }

    /**
     * @return a two-char string holding the surrogate pair D83D DC1B,
     *         i.e. code point U+1F41B
     */
    public static String getBug() {
        return "\uD83D\uDC1B";
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment