Skip to content

Instantly share code, notes, and snippets.

@jtbates
Created July 6, 2015 18:41
Show Gist options
  • Save jtbates/c9e4d30247539a722af7 to your computer and use it in GitHub Desktop.
Save jtbates/c9e4d30247539a722af7 to your computer and use it in GitHub Desktop.
rJava incorrect handling of UTF-16 surrogate pairs
# Minimal reproduction: fetch a supplementary-plane character (returned by
# UnicodePoints.getAstonishedFace) from Java through rJava and inspect the
# bytes R ends up holding.
library(rJava)

# Start the JVM and put the compiled helper class on the class path.
.jinit()
.jaddClassPath("UnicodePoints.jar")

unicode_points <- .jnew("UnicodePoints")
astonished_face <- .jcall(
  unicode_points,
  returnSig = "S",
  method = "getAstonishedFace"
)

# Show the string as R renders it, followed by its raw bytes.
cat(astonished_face, "\n")
charToRaw(astonished_face) # ed a0 bd ed b8 b2

# A string operation on the value; presumably this is where the ill-formed
# byte sequence above surfaces as an error.
tolower(astonished_face)
# Demonstration: a Java string holding a UTF-16 surrogate pair is fetched
# through rJava twice -- once via the default string conversion and once via
# an explicit UTF-16 -> UTF-8 conversion done on the Java side -- and the raw
# bytes of both results are compared.
library(rJava)

# Start the JVM and put the compiled helper class on the class path.
.jinit()
.jaddClassPath("UnicodePoints.jar")
unicode.points <- .jnew("UnicodePoints")

# localeToCharset() may return several candidates (or NA); keep the first one
# and use it for display only.
charset <- localeToCharset()[1]

cat("\nLet's try to get the bug pictograph U+1F41B from Java...\n\n")

# Default conversion: rJava turns the Java String into an R character vector.
r.string1 <- .jcall(unicode.points, "S", "getBug")
cat("This is how it displays:\n")
cat(paste(r.string1, "\n\n"))
cat("It should display as:\n")
cat("\U1F41B \n\n")
cat(paste("The encoding for the string we get from rJava is: ",
          Encoding(r.string1), "\n\n"))
cat(paste("A string with \"unknown\" encoding should be treated as the local",
          "charset: ", charset, "\n\n"))
cat("The raw encoding of the string we get from rJava is:\n")
print(charToRaw(r.string1)) # ed a0 bd ed b0 9b
cat("\nWe see that the surrogate pair is encoded as a pair of 3 byte \n")
cat("sequences and is thus ill-formed according to the UTF-8 standard: \n")
cat("http://unicode.org/faq/utf_bom.html#utf8-4 \n\n")
cat("String operations will result in an error: \n")
tryCatch({
  tolower(r.string1)
}, error = function(e) {
  # Report only the condition message; pasting the condition object itself
  # would also print the deparsed call.
  cat(paste(conditionMessage(e), "\n\n"))
})

cat("Java can properly convert from UTF-16 to UTF-8 to avoid this error.\n\n")

# evalString = FALSE keeps the result as a Java object reference so that
# String.getBytes can be called on it.
java.string2 <- .jcall(unicode.points, "S", "getBug",
                       evalString = FALSE)

# Ask Java for UTF-8 bytes explicitly. Using the locale charset here would
# only produce the bytes shown below when the session locale happens to be
# UTF-8.
bytes2 <- .jcall(java.string2, "[B", method = "getBytes", "UTF-8")
r.string2 <- rawToChar(bytes2)
# rawToChar() leaves the encoding undeclared; mark the bytes as UTF-8 so R
# does not interpret them in the native encoding.
Encoding(r.string2) <- "UTF-8"

cat("Using the getBytes String method with the UTF-8 charset gives us:\n")
print(charToRaw(r.string2)) # f0 9f 90 9b
cat("\nThis is the correct UTF-8 4 byte sequence.\n\n")
cat("It displays correctly:\n")
cat(paste0(r.string2, "\n\n")) # correct display
cat("And it can be used in string operations without error.\n")
tolower(r.string2) # executes without error
// Compile to jar file with:
// $ javac UnicodePoints.java
// $ jar cf UnicodePoints.jar UnicodePoints.class
/**
 * Supplies strings that each consist of one supplementary-plane character,
 * written as a UTF-16 surrogate pair.
 */
class UnicodePoints {
    /** U+1F632 expressed as its high and low surrogate code units. */
    private static final String ASTONISHED_FACE = "\uD83D\uDE32";

    /** U+1F41B expressed as its high and low surrogate code units. */
    private static final String BUG = "\uD83D\uDC1B";

    /**
     * @return a two-code-unit string holding the surrogate pair for U+1F632
     */
    public static String getAstonishedFace() {
        return ASTONISHED_FACE;
    }

    /**
     * @return a two-code-unit string holding the surrogate pair for U+1F41B
     */
    public static String getBug() {
        return BUG;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment