Skip to content

Instantly share code, notes, and snippets.

@swanhtet1992
Created May 13, 2015 03:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save swanhtet1992/5b5cc07edf6fce1c8be3 to your computer and use it in GitHub Desktop.
Save swanhtet1992/5b5cc07edf6fce1c8be3 to your computer and use it in GitHub Desktop.
Zawgyi <-> Unicode Converter written in Scala
// Zawgyi<>Unicode converter Scala module
// Based on rules from Parabaik Myanmar Text Converter Copyright (C) 2014 Ngwe Tun (Solveware Solution)
// Copyright (C) 2014 Swan Htet Aung
/**
 * Small command-line parameter parser for arguments supplied as `-name value` pairs.
 *
 * Parameters are declared with the chainable `parm`, `req` and `rex` builders, e.g.
 * `new ParseParms(help).parm("-p1", "1").rex(".+").req(true)`, and the actual
 * command line is then checked with `validate`.
 *
 * @param help usage text embedded in the message returned by a failed `validate`
 * @author SH (github.com/swanhtet1992)
 */
class ParseParms(val help: String) {

  // Declared parameters: name -> (default value, validation regex, required flag).
  private var parms = Map[String, (String, String, Boolean)]()

  // Name of the most recently declared parameter; `req` and `rex` modify that entry.
  private var cache: Option[String] = None

  /** Declares an optional parameter with an empty default, validated against `^.*$`. */
  def parm(name: String): ParseParms =
    register(name, "", "^.*$", required = false)

  /** Declares an optional parameter whose validation regex is derived from `default`. */
  def parm(name: String, default: String): ParseParms =
    register(name, default, defRex(default), required = false)

  /** Declares an optional parameter with an explicit validation regex. */
  def parm(name: String, default: String, rex: String): ParseParms =
    register(name, default, rex, required = false)

  /** Declares a parameter with an explicit validation regex and required flag. */
  def parm(name: String, default: String, rex: String, req: Boolean): ParseParms =
    register(name, default, rex, req)

  /** Declares a parameter with a required flag; the regex is derived from `default`. */
  def parm(name: String, default: String, req: Boolean): ParseParms =
    register(name, default, defRex(default), req)

  /**
   * Sets the required flag of the most recently declared parameter.
   * Does nothing when no parameter has been declared yet or its name is empty.
   *
   * @return this parser, so calls can be chained: `.parm("-p1", "1").req(true)`
   */
  def req(value: Boolean): ParseParms = {
    currentName.foreach { name =>
      val (default, rex, _) = parms(name)
      parms += name -> ((default, rex, value))
    }
    this
  }

  /**
   * Sets the validation regex of the most recently declared parameter.
   * Does nothing when no parameter has been declared yet or its name is empty.
   *
   * @return this parser, so calls can be chained: `.parm("-p1", "1").rex(".+")`
   */
  def rex(value: String): ParseParms = {
    currentName.foreach { name =>
      val (default, _, required) = parms(name)
      parms += name -> ((default, value, required))
    }
    this
  }

  /**
   * Validates `args` against the declared parameters.
   *
   * @param args command-line arguments as alternating `-name value` elements; a list
   *             with an odd number of elements is treated as if no argument was given
   * @return `(true, "", merged)` on success, where `merged` holds every declared
   *         parameter with its supplied value or its default; otherwise
   *         `(false, message, Map())`, where `message` names the missing required
   *         and/or invalid parameters
   */
  def validate(args: List[String]): (Boolean, String, Map[String, String]) = {
    val argsMap = genMap(args)
    val missing = testRequired(argsMap)
    val invalid = validParms(argsMap)
    if (missing.isEmpty && invalid.isEmpty) (true, "", mergeParms(argsMap))
    else (false, failureMessage(missing, invalid), Map[String, String]())
  }

  // Stores (or overwrites) a declaration and remembers its name for `req` / `rex`.
  private def register(name: String, default: String, rex: String, required: Boolean): ParseParms = {
    parms += name -> ((default, rex, required))
    cache = Some(name)
    this
  }

  // The parameter `req` / `rex` should act on, if there is one with a non-empty name.
  private def currentName: Option[String] = cache.filter(_.nonEmpty)

  // An all-digit default implies a digits-only regex; anything else gets `^.*$`.
  private def defRex(default: String): String =
    if (default.matches("^\\d+$")) "^\\d+$" else "^.*$"

  // Pairs up `-name value` elements; an odd-length list yields an empty Map.
  private def genMap(args: List[String]): Map[String, String] =
    if (args.length % 2 != 0) Map.empty
    else args.grouped(2).collect { case List(name, value) => name -> value }.toMap

  // Names of required parameters that are absent from `args`.
  private def testRequired(args: Map[String, String]): List[String] =
    parms.collect { case (name, (_, _, true)) if !args.contains(name) => name }.toList

  // Names of supplied arguments that are declared but fail their regex.
  // Arguments that were never declared are ignored.
  private def validParms(args: Map[String, String]): List[String] =
    args.collect {
      case (name, value) if parms.get(name).exists { case (_, rex, _) => !value.matches(rex) } =>
        name
    }.toList

  // Every declared parameter mapped to its supplied value, or to its default.
  private def mergeParms(args: Map[String, String]): Map[String, String] =
    parms.map { case (name, (default, _, _)) => name -> args.getOrElse(name, default) }

  // Builds the failure text; each section appears only when it has something to report.
  private def failureMessage(missing: List[String], invalid: List[String]): String = {
    val missingPart =
      if (missing.nonEmpty) "\n\trequired parms missing: " + missing.mkString(" ") else ""
    val invalidPart =
      if (invalid.nonEmpty) "\n\tinvalid parms: " + invalid.mkString(" ") + "\n" else ""
    "\nhelp: " + help + missingPart + invalidPart
  }
}
object Converter {
/**
 * Command-line entry point, invoked as `-to zg/uni -path file.txt`.
 *
 * Declares the two required parameters, validates `args` against them and then
 * either runs the conversion or prints the validation message.
 *
 * @param args raw command-line arguments, expected as `-name value` pairs
 */
def main(args: Array[String]): Unit = {
  val helpString = " -to zg/uni -path file.txt"
  val pp = new ParseParms(helpString)
  pp.parm("-to", "zg").req(true)
    .parm("-path", "file.txt").rex("^.*\\.txt$").req(true)
  pp.validate(args.toList) match {
    case (true, _, options) =>
      // Look the values up by name: the order in which a Map yields its values is
      // an implementation detail and must not decide which one is the target.
      convertFile(options("-to"), options("-path"))
    case (_, message, _) =>
      println(message)
  }
}
/**
 * Reads the text file at `path`, converts it and writes the result into the
 * current working directory.
 *
 * The file is read and written as UTF-8 rather than the platform default charset,
 * so Myanmar code points survive on systems whose default encoding is not UTF-8.
 *
 * @param to   target encoding: `"zg"` writes `./zawgyi.txt`, `"uni"` writes
 *             `./unicode.txt`; any other value only prints a message
 * @param path path of the input text file
 */
def convertFile(to: String, path: String): Unit = {
  import java.nio.charset.StandardCharsets.UTF_8
  import java.nio.file.{Files, Paths}

  // readAllBytes opens and closes the file itself, so no handle is left open.
  val text = new String(Files.readAllBytes(Paths.get(path)), UTF_8)

  // Creates the target file, or truncates it when it already exists.
  def write(target: String, converted: String): Unit = {
    Files.write(Paths.get(target), converted.getBytes(UTF_8))
    ()
  }

  to match {
    case "zg"  => write("./zawgyi.txt", uni512zg1(text))
    case "uni" => write("./unicode.txt", zg12uni51(text))
    case _     => println("Please give me valid options")
  }
}
/**
 * Converts Unicode-encoded Myanmar text to Zawgyi by applying a fixed sequence of
 * regular-expression substitutions.
 *
 * The rules are applied strictly in the order listed; later rules operate on the
 * output of earlier ones, so the order must not be changed.
 *
 * @param input text to convert
 * @return the text after every rule has been applied once, in order
 */
def uni512zg1(input: String): String = {
  // Ordered (regex, replacement) pairs, handed to String.replaceAll as they are.
  val rules: Seq[(String, String)] = Seq(
    "\\u104e\\u1004\\u103a\\u1038" -> "\u104e",
    "\\u102b\\u103a" -> "\u105a",
    "\\u102d\\u1036" -> "\u108e",
    "\\u103f" -> "\u1086",
    // U+1037 becomes U+1094 or U+1095 depending on what precedes it
    "(?<=\\u102f)\\u1037" -> "\u1094",
    "(?<=\\u102f\\u1036)\\u1037" -> "\u1094",
    "(?<=\\u1030)\\u1037" -> "\u1094",
    "(?<=\\u1030\\u1036)\\u1037" -> "\u1094",
    "(?<=\\u1014)\\u1037" -> "\u1094",
    "(?<=\\u1014[\\u103a\\u1032])\\u1037" -> "\u1094",
    "(?<=\\u103b)\\u1037" -> "\u1095",
    "(?<=\\u103b[\\u1032\\u1036])\\u1037" -> "\u1095",
    "(?<=\\u103d)\\u1037" -> "\u1095",
    "(?<=\\u103d[\\u1032])\\u1037" -> "\u1095",
    // U+102F becomes U+1033 after the listed preceding characters
    "(?<=[\\u103b\\u103c\\u103d])\\u102f" -> "\u1033",
    "(?<=[\\u103b\\u103c\\u103d][\\u102d\\u1036])\\u102f" -> "\u1033",
    "(?<=(\\u1039[\\u1000-\\u1021]))\\u102f" -> "\u1033",
    "(?<=(\\u1039[\\u1000-\\u1021])[\\u102d\\u1036])\\u102f" -> "\u1033",
    "(?<=[\\u100a\\u100c\\u1020\\u1025\\u1029])\\u102f" -> "\u1033",
    "(?<=[\\u100a\\u100c\\u1020\\u1025\\u1029][\\u102d\\u1036])\\u102f" -> "\u1033",
    // U+1030 becomes U+1034 after the listed preceding characters
    "(?<=[\\u103b\\u103c])\\u1030" -> "\u1034",
    "(?<=[\\u103b\\u103c][\\u103d])\\u1030" -> "\u1034",
    "(?<=[\\u103b\\u103c][\\u103e])\\u1030" -> "\u1034",
    "(?<=[\\u103b\\u103c][\\u102d\\u1036])\\u1030" -> "\u1034",
    "(?<=[\\u103b\\u103c][\\u103d][\\u103e])\\u1030" -> "\u1034",
    "(?<=[\\u103b\\u103c][\\u103d][\\u102d\\u1036])\\u1030" -> "\u1034",
    "(?<=[\\u103b\\u103c][\\u103e][\\u102d\\u1036])\\u1030" -> "\u1034",
    "(?<=[\\u103b\\u103c][\\u103d][\\u103e][\\u102d\\u1036])\\u1030" -> "\u1034",
    "(?<=(\\u1039[\\u1000-\\u1021]))\\u1030" -> "\u1034",
    "(?<=(\\u1039[\\u1000-\\u1021])[\\u102d\\u1036])\\u1030" -> "\u1034",
    "(?<=[\\u100a\\u100c\\u1020\\u1025\\u1029])\\u1030" -> "\u1034",
    "(?<=[\\u100a\\u100c\\u1020\\u1025\\u1029][\\u102d\\u1036])\\u1030" -> "\u1034",
    "(?<=\\u103c)\\u103e" -> "\u1087",
    "\\u1009(?=[\\u103a])" -> "\u1025",
    "\\u1009(?=\\u1039[\\u1000-\\u1021])" -> "\u1025",
    // reordering: U+1031 and U+103C are moved in front of the preceding letter
    "([\\u1000-\\u1021\\u1029])((?:\\u1039[\\u1000-\\u1021])?)((?:[\\u103b-\\u103e\\u1087]*)?)\\u1031" -> "\u1031$1$2$3",
    "([\\u1000-\\u1021\\u1029])((?:\\u1039[\\u1000-\\u1021\\u1000-\\u1021])?)(\\u103c)" -> "$3$1$2",
    // U+1004 U+103A U+1039 collapses to U+1064, which is then moved behind the
    // following letter (merging with a following U+102D / U+102E / U+1036)
    "\\u1004\\u103a\\u1039" -> "\u1064",
    "(\\u1064)((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])\\u102d" -> "$2$3$4\u108b",
    "(\\u1064)((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])\\u102e" -> "$2$3$4\u108c",
    "(\\u1064)((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])\\u1036" -> "$2$3$4\u108d",
    "(\\u1064)((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])" -> "$2$3$4\u1064",
    // the second rule of each pair below maps a character to itself
    "\\u100a(?=[\\u1039\\u102f\\u1030])" -> "\u106b",
    "\\u100a" -> "\u100a",
    "\\u101b(?=[\\u102f\\u1030])" -> "\u1090",
    "\\u101b" -> "\u101b",
    "\\u1014(?=[\\u1039\\u103d\\u103e\\u102f\\u1030])" -> "\u108f",
    "\\u1014" -> "\u1014",
    // U+1039 followed by a letter collapses to a single code point
    "\\u1039\\u1000" -> "\u1060",
    "\\u1039\\u1001" -> "\u1061",
    "\\u1039\\u1002" -> "\u1062",
    "\\u1039\\u1003" -> "\u1063",
    "\\u1039\\u1005" -> "\u1065",
    "\\u1039\\u1006" -> "\u1066",
    "(?<=[\\u1001\\u1002\\u1004\\u1005\\u1007\\u1012\\u1013\\u108f\\u1015\\u1016\\u1017\\u1019\\u101d])\\u1066" -> "\u1067",
    "\\u1039\\u1007" -> "\u1068",
    "\\u1039\\u1008" -> "\u1069",
    "\\u1039\\u100f" -> "\u1070",
    "\\u1039\\u1010" -> "\u1071",
    "(?<=[\\u1001\\u1002\\u1004\\u1005\\u1007\\u1012\\u1013\\u108f\\u1015\\u1016\\u1017\\u1019\\u101d])\\u1071" -> "\u1072",
    "\\u1039\\u1011" -> "\u1073",
    "(?<=[\\u1001\\u1002\\u1004\\u1005\\u1007\\u1012\\u1013\\u108f\\u1015\\u1016\\u1017\\u1019\\u101d])\\u1073" -> "\u1074",
    "\\u1039\\u1012" -> "\u1075",
    "\\u1039\\u1013" -> "\u1076",
    "\\u1039\\u1014" -> "\u1077",
    "\\u1039\\u1015" -> "\u1078",
    "\\u1039\\u1016" -> "\u1079",
    "\\u1039\\u1017" -> "\u107a",
    "\\u1039\\u1018" -> "\u107b",
    "\\u1039\\u1019" -> "\u107c",
    "\\u1039\\u101c" -> "\u1085",
    "\\u100f\\u1039\\u100d" -> "\u1091",
    "\\u100b\\u1039\\u100c" -> "\u1092",
    "\\u1039\\u100c" -> "\u106d",
    "\\u100b\\u1039\\u100b" -> "\u1097",
    "\\u1039\\u100b" -> "\u106c",
    "\\u100e\\u1039\\u100d" -> "\u106f",
    "\\u100d\\u1039\\u100d" -> "\u106e",
    "\\u1009(?=\\u103a)" -> "\u1025",
    "\\u1025(?=[\\u1039\\u102f\\u1030])" -> "\u106a",
    "\\u1025" -> "\u1025", // maps the character to itself
    // U+103A..U+103E are shifted down by one code point; each step relies on the
    // previous one having already vacated its target, hence the exact order
    "\\u103a" -> "\u1039",
    "\\u103b\\u103d\\u103e" -> "\u107d\u108a",
    "\\u103d\\u103e" -> "\u108a",
    "\\u103b" -> "\u103a",
    "\\u103c" -> "\u103b",
    "\\u103d" -> "\u103c",
    "\\u103e" -> "\u103d",
    "\\u103a(?=[\\u103c\\u103d\\u108a])" -> "\u107d",
    "(?<=\\u100a(?:[\\u102d\\u102e\\u1036\\u108b\\u108c\\u108d\\u108e]))\\u103d" -> "\u1087",
    "(?<=\\u100a)\\u103d" -> "\u1087",
    // U+103B is replaced by one of U+107E..U+1082 depending on its neighbours
    "\\u103b(?=[\\u1000\\u1003\\u1006\\u100f\\u1010\\u1011\\u1018\\u101a\\u101c\\u101e\\u101f\\u1021])" -> "\u107e",
    "\\u107e([\\u1000-\\u1021\\u108f])(?=[\\u102d\\u102e\\u1036\\u108b\\u108c\\u108d\\u108e])" -> "\u1080$1",
    "\\u107e([\\u1000-\\u1021\\u108f])(?=[\\u103c\\u108a])" -> "\u1082$1",
    "\\u103b([\\u1000-\\u1021\\u108f])(?=[\\u102d\\u102e\\u1036\\u108b\\u108c\\u108d\\u108e])" -> "\u107f$1",
    "\\u103b([\\u1000-\\u1021\\u108f])(?=[\\u103c\\u108a])" -> "\u1081$1",
    // second pass over U+1037 / U+1094 now that the characters before them changed
    "(?<=\\u1014)\\u1037" -> "\u1094",
    "(?<=\\u1014[\\u103a\\u1032])\\u1037" -> "\u1094",
    "(?<=\\u1033)\\u1094" -> "\u1095",
    "(?<=\\u1033[\\u1036])\\u1094" -> "\u1095",
    "(?<=\\u1034)\\u1094" -> "\u1095",
    "(?<=\\u1034[\\u1036])\\u1094" -> "\u1095",
    "(?<=[\\u103c\\u103d\\u108a])\\u1037" -> "\u1095",
    "(?<=[\\u103c\\u103d\\u108a][\\u1032])\\u1037" -> "\u1095"
  )

  rules.foldLeft(input) { case (text, (pattern, replacement)) =>
    text.replaceAll(pattern, replacement)
  }
}
/**
 * Converts Zawgyi-encoded Myanmar text to Unicode by applying a fixed sequence of
 * regular-expression substitutions.
 *
 * The rules are applied strictly in the order listed; later rules operate on the
 * output of earlier ones, so the order must not be changed.
 *
 * @param input text to convert
 * @return the text after every rule has been applied once, in order
 */
def zg12uni51(input: String): String = {
  // Ordered (regex, replacement) pairs, handed to String.replaceAll as they are.
  val rules: Seq[(String, String)] = Seq(
    "\\u106a" -> "\u1009",
    "\\u1025(?=[\\u1039\\u102c])" -> "\u1009",
    "\\u1025\\u102e" -> "\u1026",
    "\\u106b" -> "\u100a",
    "\\u1090" -> "\u101b",
    "\\u1040" -> "\u1040", // maps the character to itself
    "\\u108f" -> "\u1014",
    "\\u1012" -> "\u1012", // maps the character to itself
    "\\u1013" -> "\u1013", // maps the character to itself
    // U+103A..U+103D are shifted up by one code point, highest first, so that a
    // freshly produced character is not shifted again by the next rule
    "[\\u103d\\u1087]" -> "\u103e",
    "\\u103c" -> "\u103d",
    "[\\u103b\\u107e\\u107f\\u1080\\u1081\\u1082\\u1083\\u1084]" -> "\u103c",
    "[\\u103a\\u107d]" -> "\u103b",
    "\\u103d\\u103b" -> "\u103b\u103d",
    "\\u108a" -> "\u103d\u103d",
    "\\u103d\\u103d" -> "\u103d\u103d", // replacement equals the matched text
    // U+1064 (and the combined forms U+108B..U+108D) is moved in front of the
    // letter it follows; U+1064 itself is expanded further down
    "((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])\\u1064" -> "\u1064$1$2$3",
    "((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])\\u108b" -> "\u1064$1$2$3\u102d",
    "((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])\\u108c" -> "\u1064$1$2$3\u102e",
    "((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])\\u108d" -> "\u1064$1$2$3\u1036",
    "\\u105a" -> "\u102b\u103a",
    "\\u108e" -> "\u102d\u1036",
    "\\u1033" -> "\u102f",
    "\\u1034" -> "\u1030",
    "\\u1088" -> "\u103d\u102f",
    "\\u1089" -> "\u103d\u1030",
    // must run before the rules below that introduce U+1039 in their replacement
    "\\u1039" -> "\u103a",
    "[\\u1094\\u1095]" -> "\u1037",
    "([\\u1000-\\u1021])([\\u102c\\u102d\\u102e\\u1032\\u1036]){1,2}([\\u1060\\u1061\\u1062\\u1063\\u1065\\u1066\\u1067\\u1068\\u1069\\u1070\\u1071\\u1072\\u1073\\u1074\\u1075\\u1076\\u1077\\u1078\\u1079\\u107a\\u107b\\u107c\\u1085])" -> "$1$3$2",
    "\\u1064" -> "\u1004\u103a\u1039",
    "\\u104e" -> "\u104e\u1004\u103a\u1038",
    "\\u1086" -> "\u103f",
    // single code points expand to U+1039 followed by a letter
    "\\u1060" -> "\u1039\u1000",
    "\\u1061" -> "\u1039\u1001",
    "\\u1062" -> "\u1039\u1002",
    "\\u1063" -> "\u1039\u1003",
    "\\u1065" -> "\u1039\u1005",
    "[\\u1066\\u1067]" -> "\u1039\u1006",
    "\\u1068" -> "\u1039\u1007",
    "\\u1069" -> "\u1039\u1008",
    "\\u106c" -> "\u1039\u100b",
    "\\u1070" -> "\u1039\u100f",
    "[\\u1071\\u1072]" -> "\u1039\u1010",
    "[\\u1073\\u1074]" -> "\u1039\u1011",
    "\\u1075" -> "\u1039\u1012",
    "\\u1076" -> "\u1039\u1013",
    "\\u1077" -> "\u1039\u1014",
    "\\u1078" -> "\u1039\u1015",
    "\\u1079" -> "\u1039\u1016",
    "\\u107a" -> "\u1039\u1017",
    "\\u107b" -> "\u1039\u1018",
    "\\u107c" -> "\u1039\u1019",
    "\\u1085" -> "\u1039\u101c",
    "\\u106d" -> "\u1039\u100c",
    "\\u1091" -> "\u100f\u1039\u100d",
    "\\u1092" -> "\u100b\u1039\u100c",
    "\\u1097" -> "\u100b\u1039\u100b",
    "\\u106f" -> "\u100e\u1039\u100d",
    "\\u106e" -> "\u100d\u1039\u100d",
    // reordering: U+103C moves behind the letter (and stacked letter) it precedes
    "(\\u103c)([\\u1000-\\u1021])((?:\\u1039[\\u1000-\\u1021])?)" -> "$2$3$1",
    "(\\u103d)(\\u103d)([\\u103b\\u103c])" -> "$3$2$1",
    // listed twice on purpose: the same swap is applied in two consecutive passes
    "(\\u103d)([\\u103b\\u103c])" -> "$2$1",
    "(\\u103d)([\\u103b\\u103c])" -> "$2$1",
    // U+1040 and U+1047 next to Myanmar characters become U+101D and U+101B
    "(?<=([\\u1000-\\u101c\\u101e-\\u102a\\u102c\\u102e-\\u103d\\u104c-\\u109f]))(\\u1040)(?=\\s)?" -> "\u101d",
    "(?<=(\\u101d))(\\u1040)(?=\\s)?" -> "\u101d",
    "(?<=([\\u1000-\\u101c\\u101e-\\u102a\\u102c\\u102e-\\u103d\\u104c-\\u109f\\s]))(\\u1047)" -> "\u101b",
    "(\\u1047)(?=[\\u1000-\\u101c\\u101e-\\u102a\\u102c\\u102e-\\u103d\\u104c-\\u109f\\s])" -> "\u101b",
    // final reordering of the marks around each letter into the order 2 3 6 1 4 9 7 5 8
    "((?:\\u1031)?)([\\u1000-\\u1021])((?:\\u1039[\\u1000-\\u1021])?)((?:[\\u102d\\u102e\\u1032])?)([\\u1036\\u1037\\u1038]{0,2})([\\u103b-\\u103d]{0,3})((?:[\\u102f\\u1030])?)([\\u1036\\u1037\\u1038]{0,2})((?:[\\u102d\\u102e\\u1032])?)" -> "$2$3$6$1$4$9$7$5$8",
    "\\u1036\\u102f" -> "\u102f\u1036",
    "(\\u103a)(\\u1037)" -> "$2$1"
  )

  rules.foldLeft(input) { case (text, (pattern, replacement)) =>
    text.replaceAll(pattern, replacement)
  }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment