Skip to content

Instantly share code, notes, and snippets.

@aembleton
Last active August 29, 2015 14:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aembleton/c3495438eac9ea19fff6 to your computer and use it in GitHub Desktop.
Save aembleton/c3495438eac9ea19fff6 to your computer and use it in GitHub Desktop.
Translates between Unicode text and its ASCII-only form, in which each non-ASCII character is written as a \uXXXX escape.
import scala.annotation.tailrec
/**
 * Converts between text containing non-ASCII characters and an ASCII-only form in which each
 * such character is written as a `\uXXXX` escape (backslash, `u`, four hex digits).
 */
object UnicodeTranslator {

  /** Matches one `\uXXXX` escape; group 1 captures the four hex digits (either case). */
  private val unicodeRegex = """\\u([0-9a-fA-F]{4})""".r

  /** Highest character value that is copied through without being escaped. */
  private val MaxAscii = 127

  /**
   * Replaces every character above 127 with its `\uXXXX` escape, using upper-case hex digits.
   *
   * Characters up to and including 127 are copied unchanged. That includes backslashes, so
   * text that already looks like an escape is not protected from a later `unescapeUnicode`.
   *
   * @param stringToEscape the text to convert
   * @return the converted text, containing only characters in the range 0-127
   */
  def escapeUnicode(stringToEscape: String): String = {
    // One builder keeps this linear; folding with String `+` re-copies the prefix on every step.
    val escaped = new StringBuilder(stringToEscape.length)
    stringToEscape.foreach { char =>
      if (char <= MaxAscii) escaped.append(char) else escaped.append(charToHex(char))
    }
    escaped.toString
  }

  /**
   * Replaces every `\uXXXX` escape with the character it denotes.
   *
   * Escapes are found left to right without overlapping; text that is not a complete escape
   * (for example `\u00F` with only three hex digits) is copied unchanged.
   *
   * @param stringToRemoveEscapesFrom the text that may contain `\uXXXX` escapes
   * @return the text with each complete escape decoded
   */
  def unescapeUnicode(stringToRemoveEscapesFrom: String): String =
    unicodeRegex.replaceAllIn(
      stringToRemoveEscapesFrom,
      // The decoded character can itself be `$` or `\`, which are special in a replacement
      // string, so it has to be quoted to be inserted literally.
      found => java.util.regex.Matcher.quoteReplacement(hexToChar(found.group(1)).toString)
    )

  /** Formats a character as `\u` followed by its value as four zero-padded upper-case hex digits. */
  private def charToHex(c: Char): String = "\\u%04X".format(c.toInt)

  /** Parses hex digits (without the `\u` prefix) into the character with that value. */
  private def hexToChar(hexDigits: String): Char = Integer.parseInt(hexDigits, 16).toChar
}
import org.junit.runner.RunWith
import org.specs2.mutable.Specification
import org.specs2.runner.JUnitRunner
/**
 * Specification for `UnicodeTranslator`: escaping non-ASCII characters to `\uXXXX` sequences
 * and decoding those sequences back, including boundary and malformed-input cases.
 */
@RunWith(classOf[JUnitRunner])
class UnicodeTranslatorTest extends Specification {

  "escaping unicode" should {
    "not make any modifications for ASCII strings" in {
      UnicodeTranslator.escapeUnicode("Hello World") mustEqual "Hello World"
    }
    "return an empty string unchanged" in {
      UnicodeTranslator.escapeUnicode("") mustEqual ""
    }
    "replace a single character with its unicode equivalent" in {
      UnicodeTranslator.escapeUnicode("ò") mustEqual "\\u00F2"
    }
    "replace the accented characters in a String" in {
      UnicodeTranslator.escapeUnicode("Bòrd na Gàidhlig") mustEqual "B\\u00F2rd na G\\u00E0idhlig"
    }
    // 127 is the last character left alone; 128 is the first one that gets escaped.
    "leave the character with value 127 unescaped" in {
      UnicodeTranslator.escapeUnicode(127.toChar.toString) mustEqual 127.toChar.toString
    }
    "escape the character with value 128" in {
      UnicodeTranslator.escapeUnicode(128.toChar.toString) mustEqual "\\u0080"
    }
  }

  "unescaping unicode" should {
    "not make any modifications for ASCII strings" in {
      UnicodeTranslator.unescapeUnicode("Hello World") mustEqual "Hello World"
    }
    "return an empty string unchanged" in {
      UnicodeTranslator.unescapeUnicode("") mustEqual ""
    }
    "replace a unicode with its equivalent character" in {
      UnicodeTranslator.unescapeUnicode("\\u00F2") mustEqual "ò"
    }
    "accept lower-case hex digits" in {
      UnicodeTranslator.unescapeUnicode("\\u00f2") mustEqual "ò"
    }
    "replace unicodes in a string with the equivalent accented characters" in {
      UnicodeTranslator.unescapeUnicode("B\\u00F2rd na G\\u00E0idhlig") mustEqual "Bòrd na Gàidhlig"
    }
    "leave an incomplete escape untouched" in {
      UnicodeTranslator.unescapeUnicode("\\u00F") mustEqual "\\u00F"
    }
    // A dollar sign is special in regex replacement strings, so decoding to one is worth pinning.
    "decode an escape whose character is a dollar sign" in {
      UnicodeTranslator.unescapeUnicode("\\u0024") mustEqual "$"
    }
    "restore the original text after escaping" in {
      val original = "Bòrd na Gàidhlig"
      UnicodeTranslator.unescapeUnicode(UnicodeTranslator.escapeUnicode(original)) mustEqual original
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment