jtbates/UnicodePoints.java

## rJavaSurrogatePairs-MRE.R
# Minimal example: fetch a string from the UnicodePoints Java class through
# rJava, show it, dump its bytes, and pass it to a base string function.
library(rJava)
.jinit()
.jaddClassPath("UnicodePoints.jar")
unicode_points <- .jnew("UnicodePoints")

# "S" asks rJava to hand the Java String back as an R character value.
astonished_face <- .jcall(
  unicode_points,
  returnSig = "S",
  method = "getAstonishedFace"
)
cat(astonished_face, "\n")
charToRaw(astonished_face) # ed a0 bd ed b8 b2
# NOTE(review): presumably this is the call that exposes the problem being
# reproduced -- confirm against the full rJavaSurrogatePairs.R script.
tolower(astonished_face)

## rJavaSurrogatePairs.R
# Walk-through of how a supplementary-plane character (U+1F41B) travels from
# Java to R via rJava: first the default string conversion, whose bytes are a
# pair of 3 byte sequences, then a conversion done on the Java side that
# yields the 4 byte UTF-8 sequence.
library(rJava)
.jinit()
.jaddClassPath("UnicodePoints.jar")
unicode.points <- .jnew("UnicodePoints")
# Only used for the informational message below. localeToCharset() can
# return NA or more than one candidate, so it is not passed on to Java.
charset <- localeToCharset()

cat("\nLet's try to get the bug pictograph U+1F41B from Java...\n\n")
r.string1 <- .jcall(unicode.points, "S", "getBug")

cat("This is how it displays:\n")
cat(paste(r.string1, "\n\n"))

cat("It should display as:\n")
cat("\U1F41B \n\n")

cat(paste("The encoding for the string we get from rJava is: ",
          Encoding(r.string1), "\n\n"))

# Collapse so that a multi-candidate result still prints as one sentence.
cat(paste("A string with \"unknown\" encoding should be treated as the local",
          "charset: ", paste(charset, collapse = ", "), "\n\n"))

cat("The raw encoding of the string we get from rJava is:\n")
print(charToRaw(r.string1)) # ed a0 bd ed b0 9b

cat("\nWe see that the surrogate pair is encoded as a pair of 3 byte \n")
cat("sequences and is thus ill-formed according to the UTF-8 standard: \n")
cat("http://unicode.org/faq/utf_bom.html#utf8-4 \n\n")

cat("String operations will result in an error: \n")
# Catch the error so the script keeps going, and print it as text.
tryCatch({
  tolower(r.string1)
}, error = function(e) {
  cat(paste(e, "\n\n"))
})

cat("Java can properly convert from UTF-16 to UTF-8 to avoid this error.\n\n")
# evalString = FALSE keeps the result as a Java String reference so that
# getBytes() can be called on it.
java.string2 <- .jcall(unicode.points, "S", "getBug",
                       evalString = FALSE)
# Ask for UTF-8 explicitly: the messages below promise UTF-8 bytes, which the
# locale charset only delivers when the session happens to run in UTF-8.
bytes2 <- .jcall(java.string2, "[B", method = "getBytes", "UTF-8")
r.string2 <- rawToChar(bytes2)
# rawToChar() leaves the encoding unmarked; declare it so the bytes are not
# reinterpreted in the native encoding.
Encoding(r.string2) <- "UTF-8"
cat("Using the getBytes String method with the UTF-8 charset gives us:\n")
print(charToRaw(r.string2)) # f0 9f 90 9b
cat("\nThis is the correct UTF-8 4 byte sequence.\n\n")
cat("It displays correctly:\n")
cat(paste0(r.string2, "\n\n")) # correct display

cat("And it can be used in string operations without error.\n")
tolower(r.string2) # executes without error

## UnicodePoints.java
// Compile to jar file with:
// $ javac UnicodePoints.java
// $ jar cf UnicodePoints.jar UnicodePoints.class
/**
 * Supplies strings that each consist of one UTF-16 surrogate pair, i.e. a
 * single code point outside the Basic Multilingual Plane.
 */
class UnicodePoints {
  /** Code units D83D DE32, which together encode U+1F632. */
  private static final String ASTONISHED_FACE = "\uD83D\uDE32";

  /** Code units D83D DC1B, which together encode U+1F41B. */
  private static final String BUG = "\uD83D\uDC1B";

  /**
   * @return a two-char string holding the surrogate pair for U+1F632
   */
  public static String getAstonishedFace() {
    return ASTONISHED_FACE;
  }

  /**
   * @return a two-char string holding the surrogate pair for U+1F41B
   */
  public static String getBug() {
    return BUG;
  }
}
	# Start the JVM, put the jar on the class path and create the helper object.
	library(rJava)
	.jinit()
	.jaddClassPath("UnicodePoints.jar")
	points_obj <- .jnew("UnicodePoints")

	# Retrieve the Java string as an R character value ("S" return signature).
	face <- .jcall(points_obj, returnSig = "S", method = "getAstonishedFace")
	cat(paste0(face, " \n"))
	charToRaw(face) # ed a0 bd ed b8 b2
	# NOTE(review): presumably the step that demonstrates the failure -- confirm.
	tolower(face)
	// Compile to jar file with:
	// $ javac UnicodePoints.java
	// $ jar cf UnicodePoints.jar UnicodePoints.class
	/**
	 * Holder for two static accessors, each returning a string made of a
	 * single UTF-16 surrogate pair.
	 */
	class UnicodePoints {
		/** @return the surrogate pair D83D DE32 (code point U+1F632) */
		public static String getAstonishedFace() {
			final String astonishedFace = "\uD83D\uDE32";
			return astonishedFace;
		}

		/** @return the surrogate pair D83D DC1B (code point U+1F41B) */
		public static String getBug() {
			final String bug = "\uD83D\uDC1B";
			return bug;
		}
	}