Zawgyi detector
package android.widget; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
import android.text.TextUtils; | |
/** | |
* | |
* @author SH | |
*/ | |
public class ZgDetector { | |
private static final Pattern ZAWGYI_DETECT_PATTERN = Pattern.compile( | |
// A regular expression matched if text is Zawgyi encoding. | |
// Using the ranges 1033-1034 or 1060-1097 will report Shan, Karen, | |
// etc. as Zawgyi. | |
"[\u105a\u1060-\u1097]|" // Zawgyi characters outside Unicode range | |
+ "[\u1033\u1034]|" // These are Mon characters | |
+ "\u1031\u108f|" | |
+ "\u1031[\u103b-\u103e]|" // Medial right after \u1031 | |
+ "[\u102b-\u1030\u1032]\u1031|" // Vowel sign right after before \u1031 | |
+ " \u1031| \u103b|" // Unexpected characters after a space | |
+ "^\u1031|^\u103b|\u1038\u103b|\u1038\u1031|" | |
+ "[\u102d\u102e\u1032]\u103b|\u1039[^\u1000-\u1021]|\u1039$" | |
+ "|\u1004\u1039[\u1001-\u102a\u103f\u104e]" // Missing ASAT in Kinzi | |
+ "|\u1039[^u1000-\u102a\u103f\u104e]" // 1039 not before a consonant | |
// Out of order medials | |
+ "|\u103c\u103b|\u103d\u103b" | |
+ "|\u103e\u103b|\u103d\u103c" | |
+ "|\u103e\u103c|\u103e\u103d" | |
// Bad medial combos | |
+ "|\u103b\u103c" | |
// Out of order vowel signs | |
+ "|[\u102f\u1030\u102b\u102c][\u102d\u102e\u1032]" | |
+ "|[\u102b\u102c][\u102f\u102c]" | |
// Digit before diacritic | |
+ "|[\u1040-\u1049][\u102b-\u103e\u102b-\u1030\u1032\u1036\u1037\u1038\u103a]" | |
// Single digit 0, 7 at start | |
+ "|^[\u1040\u1047][^\u1040-\u1049]" | |
// Second 1039 with bad followers | |
+ "|[\u1000-\u102a\u103f\u104e]\u1039[\u101a\u101b\u101d\u101f\u1022-\u103f]" | |
// Other bad combos. | |
+ "|\u103a\u103e" | |
+ "|\u1036\u102b]" | |
// multiple upper vowels | |
+ "|\u102d[\u102e\u1032]|\u102e[\u102d\u1032]|\u1032[\u102d\u102e]" | |
// Multiple lower vowels | |
+ "|\u102f\u1030|\u1030\u102f" | |
// Multiple A vowels | |
+ "|\u102b\u102c|\u102c\u102b" | |
// Shan digits with vowels or medials or other signs | |
+ "|[\u1090-\u1099][\u102b-\u1030\u1032\u1037\u103a-\u103e]" | |
// Isolated Shan digit | |
+ "|[\u1000-\u10f4][\u1090-\u1099][\u1000-\u104f]" | |
+ "|^[\u1090-\u1099][\u1000-\u102a\u103f\u104e\u104a\u104b]" | |
+ "|[\u1000-\u104f][\u1090-\u1099]$" | |
// Diacritics with non-Burmese vowel signs | |
+ "|[\u105e-\u1060\u1062-\u1064\u1067-\u106d\u1071-\u1074\u1082-\u108d" | |
+ "\u108f\u109a-\u109d]" | |
+ "[\u102b-\u103e]" | |
// Consonant 103a + some vowel signs | |
+ "|[\u1000-\u102a]\u103a[\u102d\u102e\u1032]" | |
// 1031 after other vowel signs | |
+ "|[\u102b-\u1030\u1032\u1036-\u1038\u103a]\u1031" | |
// Using Shan combining characters with other languages. | |
+ "|[\u1087-\u108d][\u106e-\u1070\u1072-\u1074]" | |
// Non-Burmese diacritics at start, following space, or following sections | |
+ "|^[\u105e-\u1060\u1062-\u1064\u1067-\u106d\u1071-\u1074" | |
+ "\u1082-\u108d\u108f\u109a-\u109d]" | |
+ "|[\u0020\u104a\u104b][\u105e-\u1060\u1062-\u1064\u1067-\u106d" | |
+ "\u1071-\u1074\u1082-\u108d\u108f\u109a-\u109d]" | |
// Wrong order with 1036 | |
+ "|[\u1036\u103a][\u102d-\u1030\u1032]" | |
// Odd stacking | |
+ "|[\u1025\u100a]\u1039" | |
// More mixing of non-Burmese languages | |
+ "|[\u108e-\u108f][\u1050-\u108d]" | |
// Bad diacritic combos. | |
+ "|\u102d-\u1030\u1032\u1036-\u1037]\u1039]" | |
// Dot before subscripted consonant | |
+ "|[\u1000-\u102a\u103f\u104e]\u1037\u1039" | |
// Odd subscript + vowel signs | |
+ "|[\u1000-\u102a\u103f\u104e]\u102c\u1039[\u1000-\u102a\u103f\u104e]" | |
// Medials after vowels | |
+ "|[\u102b-\u1030\u1032][\u103b-\u103e]" | |
// Medials | |
+ "|\u1032[\u103b-\u103e]" | |
// Medial with 101b | |
+ "|\u101b\u103c" | |
// Stacking too deeply: consonant 1039 consonant 1039 consonant | |
+ "|[\u1000-\u102a\u103f\u104e]\u1039[\u1000-\u102a\u103f\u104e]\u1039" | |
+ "[\u1000-\u102a\u103f\u104e]" | |
// Stacking pattern consonant 1039 consonant 103a other vowel signs | |
+ "|[\u1000-\u102a\u103f\u104e]\u1039[\u1000-\u102a\u103f\u104e]" | |
+ "[\u102b\u1032\u103d]" | |
// Odd stacking over u1021, u1019, and u1000 | |
+ "|[\u1000\u1005\u100f\u1010\u1012\u1014\u1015\u1019\u101a]\u1039\u1021" | |
+ "|[\u1000\u1010]\u1039\u1019" | |
+ "|\u1004\u1039\u1000" | |
+ "|\u1015\u1039[\u101a\u101e]" | |
+ "|\u1000\u1039\u1001\u1036" | |
+ "|\u1039\u1011\u1032" | |
// Vowel sign in wrong order | |
+ "|\u1037\u1032" | |
+ "|\u1036\u103b" | |
// Duplicated vowel | |
+ "|\u102f\u102f" | |
); | |
public static CharSequence hint(CharSequence input){ | |
return zg2uni(input, true); // true means not append the original text | |
} | |
public static CharSequence text(CharSequence input) { | |
return zg2uni(input, false); // false means will append the original text | |
} | |
public static CharSequence zg2uni(CharSequence input, boolean notAppend) { | |
if(input == null) | |
return input; | |
Matcher matcher = ZAWGYI_DETECT_PATTERN.matcher(input); | |
if(matcher.find()){ | |
String output = input.toString(); | |
output = output.replaceAll("\\u106a", "\u1009"); | |
output = output.replaceAll("\\u1025(?=[\\u1039\\u102c])", "\u1009"); | |
output = output.replaceAll("\\u1025\\u102e", "\u1026"); | |
output = output.replaceAll("\\u106b", "\u100a"); | |
output = output.replaceAll("\\u1090", "\u101b"); | |
output = output.replaceAll("\\u1040", "\u1040"); | |
output = output.replaceAll("\\u108f", "\u1014"); | |
output = output.replaceAll("\\u1012", "\u1012"); | |
output = output.replaceAll("\\u1013", "\u1013"); | |
output = output.replaceAll("[\\u103d\\u1087]", "\u103e"); | |
output = output.replaceAll("\\u103c", "\u103d"); | |
output = output.replaceAll("[\\u103b\\u107e\\u107f\\u1080\\u1081\\u1082\\u1083\\u1084]", "\u103c"); | |
output = output.replaceAll("[\\u103a\\u107d]", "\u103b"); | |
output = output.replaceAll("\\u103d\\u103b", "\u103b\u103d"); | |
output = output.replaceAll("\\u108a","\u103d\u103e"); | |
output = output.replaceAll("\\u103e\\u103d", "\u103d\u103e"); | |
output = output.replaceAll("((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])\\u1064", "\u1064$1$2$3"); | |
output = output.replaceAll("((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])\\u108b", "\u1064$1$2$3\u102d"); | |
output = output.replaceAll("((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])\\u108c", "\u1064$1$2$3\u102e"); | |
output = output.replaceAll("((?:\\u1031)?)((?:\\u103c)?)([\\u1000-\\u1021])\\u108d", "\u1064$1$2$3\u1036"); | |
output = output.replaceAll("\\u105a", "\u102b\u103a"); | |
output = output.replaceAll("\\u108e", "\u102d\u1036"); | |
output = output.replaceAll("\\u1033", "\u102f"); | |
output = output.replaceAll("\\u1034", "\u1030"); | |
output = output.replaceAll("\\u1088", "\u103e\u102f"); | |
output = output.replaceAll("\\u1089", "\u103e\u1030"); | |
output = output.replaceAll("\\u1039", "\u103a"); | |
output = output.replaceAll("[\\u1094\\u1095]", "\u1037"); | |
output = output.replaceAll("([\\u1000-\\u1021])([\\u102c\\u102d\\u102e\\u1032\\u1036]){1,2}([\\u1060\\u1061\\u1062\\u1063\\u1065\\u1066\\u1067\\u1068\\u1069\\u1070\\u1071\\u1072\\u1073\\u1074\\u1075\\u1076\\u1077\\u1078\\u1079\\u107a\\u107b\\u107c\\u1085])", "$1$3$2"); | |
output = output.replaceAll("\\u1064", "\u1004\u103a\u1039"); | |
output = output.replaceAll("\\u104e", "\u104e\u1004\u103a\u1038"); | |
output = output.replaceAll("\\u1086", "\u103f"); | |
output = output.replaceAll("\\u1060", "\u1039\u1000"); | |
output = output.replaceAll("\\u1061", "\u1039\u1001"); | |
output = output.replaceAll("\\u1062", "\u1039\u1002"); | |
output = output.replaceAll("\\u1063", "\u1039\u1003"); | |
output = output.replaceAll("\\u1065", "\u1039\u1005"); | |
output = output.replaceAll("[\\u1066\\u1067]", "\u1039\u1006"); | |
output = output.replaceAll("\\u1068", "\u1039\u1007"); | |
output = output.replaceAll("\\u1069", "\u1039\u1008"); | |
output = output.replaceAll("\\u106c", "\u1039\u100b"); | |
output = output.replaceAll("\\u1070", "\u1039\u100f"); | |
output = output.replaceAll("[\\u1071\\u1072]", "\u1039\u1010"); | |
output = output.replaceAll("[\\u1073\\u1074]", "\u1039\u1011"); | |
output = output.replaceAll("\\u1075", "\u1039\u1012"); | |
output = output.replaceAll("\\u1076", "\u1039\u1013"); | |
output = output.replaceAll("\\u1077", "\u1039\u1014"); | |
output = output.replaceAll("\\u1078", "\u1039\u1015"); | |
output = output.replaceAll("\\u1079", "\u1039\u1016"); | |
output = output.replaceAll("\\u107a", "\u1039\u1017"); | |
output = output.replaceAll("\\u107b", "\u1039\u1018"); | |
output = output.replaceAll("\\u107c", "\u1039\u1019"); | |
output = output.replaceAll("\\u1085", "\u1039\u101c"); | |
output = output.replaceAll("\\u106d", "\u1039\u100c"); | |
output = output.replaceAll("\\u1091", "\u100f\u1039\u100d"); | |
output = output.replaceAll("\\u1092", "\u100b\u1039\u100c"); | |
output = output.replaceAll("\\u1097", "\u100b\u1039\u100b"); | |
output = output.replaceAll("\\u106f", "\u100e\u1039\u100d"); | |
output = output.replaceAll("\\u106e", "\u100d\u1039\u100d"); | |
output = output.replaceAll("(\\u103c)([\\u1000-\\u1021])((?:\\u1039[\\u1000-\\u1021])?)", "$2$3$1"); | |
output = output.replaceAll("(\\u103d)(\\u103d)([\\u103b\\u103c])", "$3$2$1"); | |
output = output.replaceAll("(\\u103d)([\\u103b\\u103c])", "$2$1"); | |
output = output.replaceAll("(\\u103d)([\\u103b\\u103c])", "$2$1"); | |
output = output.replaceAll("(?<=([\\u1000-\\u101c\\u101e-\\u102a\\u102c\\u102e-\\u103d\\u104c-\\u109f\\s]))(\\u1047)", "\u101b"); | |
output = output.replaceAll("(\\u1047)(?=[\\u1000-\\u101c\\u101e-\\u102a\\u102c\\u102e-\\u103d\\u104c-\\u109f\\s])", "\u101b"); | |
output = output.replaceAll("((?:\\u1031)?)([\\u1000-\\u1021])((?:\\u1039[\\u1000-\\u1021])?)((?:[\\u102d\\u102e\\u1032])?)([\\u1036\\u1037\\u1038]{0,2})([\\u103b-\\u103e]{0,3})((?:[\\u102f\\u1030])?)([\\u1036\\u1037\\u1038]{0,2})((?:[\\u102d\\u102e\\u1032])?)", "$2$3$6$1$4$9$7$5$8"); | |
output = output.replaceAll("\\u1036\\u102f", "\u102f\u1036"); | |
output = output.replaceAll("(\\u103a)(\\u1037)", "$2$1"); | |
if(notAppend) | |
return output; | |
CharSequence appendOutput = TextUtils.concat(output, "========", input); | |
return appendOutput; | |
}else{ | |
return input; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment