Skip to content

Instantly share code, notes, and snippets.

@shakram02
Last active July 25, 2019 08:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shakram02/91b6b8929328b27eadcc7afa1b4b2815 to your computer and use it in GitHub Desktop.
Save shakram02/91b6b8929328b27eadcc7afa1b4b2815 to your computer and use it in GitHub Desktop.
Replaces Alef, Waw, Yeh variants with their original letters [plain Alef, plain Waw, plain Yeh] and removes tashkeel and ligatures
/**
 * Author: @shakram02
 *
 * Normalizes Arabic text so that spelling variants compare equal:
 *  - Alef, Waw and Yeh variants are replaced with the plain letter
 *    (plain Alef U+0627, plain Waw U+0648, plain Yeh U+064A).
 *  - Tashkeel (diacritics), combining hamza marks, Quranic annotation signs,
 *    Arabic punctuation, tatweel (U+0640) and the zero-width joiner (U+200D)
 *    are removed.
 *
 * Results are memoized in a small bounded in-memory cache. The cache is a plain
 * [HashMap] with no synchronization, so callers that share this object between
 * threads must synchronize externally.
 */
object ArabicNormalizer {
    private const val ALEF = "\u0627"
    private const val WAW = "\u0648"
    private const val YEH = "\u064A"

    // Upper bound on memoized inputs; keeps the singleton from retaining every
    // string it has ever seen.
    private const val MAX_CACHE_ENTRIES = 1024

    private val normalizationCache: HashMap<String, String> = hashMapOf()

    // src: https://www.fileformat.info/info/charset/UTF-8/list.htm?start=1024
    // Letter variants only. Combining hamza marks (U+0654, U+0655, U+065F) are NOT
    // listed here: turning a combining mark into a full Alef would insert an extra
    // letter (decomposed "waw + hamza above" would become "waw + alef").
    private val unicodeAlef = Regex("[\\u0622\\u0623\\u0625\\u0670-\\u0675]")
    private val unicodeWaw = Regex("[\\u0624\\u0676\\u0677]")
    private val unicodeYeh = Regex("[\\u0626\\u0649\\u0678]")

    // Everything that is dropped outright, merged into a single character class:
    //   U+0616-U+061F  small high signs, Arabic punctuation
    //   U+0640         tatweel
    //   U+064B-U+065F  tashkeel and combining hamza / maddah marks
    //   U+06D6-U+06ED  Quranic annotation signs
    //   U+200D         zero-width joiner
    private val removable = Regex("[\\u0616-\\u061F\\u0640\\u064B-\\u065F\\u06D6-\\u06ED\\u200D]")

    /**
     * Normalize an input buffer of Arabic text.
     *
     * Characters outside the handled Arabic ranges are passed through unchanged.
     *
     * @param input input buffer
     * @return normalized string
     */
    fun normalize(input: String): String {
        if (input.isEmpty()) return input

        normalizationCache[input]?.let { return it }

        // The four character sets are disjoint, so the order of the passes
        // does not affect the result.
        val normalized = input
            .replace(unicodeAlef, ALEF)
            .replace(unicodeWaw, WAW)
            .replace(unicodeYeh, YEH)
            .replace(removable, "")

        // Cheap eviction policy: start over once the cap is reached.
        if (normalizationCache.size >= MAX_CACHE_ENTRIES) {
            normalizationCache.clear()
        }
        normalizationCache[input] = normalized
        return normalized
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment