Skip to content

Instantly share code, notes, and snippets.

@dorentus
Last active February 19, 2021 10:35
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dorentus/4cd6e80145ea140495f4 to your computer and use it in GitHub Desktop.
Save dorentus/4cd6e80145ea140495f4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env xcrun swift
import Foundation
import CoreText
/// Reads standard input until end-of-file and returns it as text.
///
/// The bytes are collected first and decoded in a single pass. Decoding each
/// read chunk on its own would fail (and drop the chunk) whenever a multi-byte
/// UTF-8 character happened to be split across two reads.
///
/// - Returns: Everything read from stdin, decoded as UTF-8. Invalid byte
///   sequences are replaced with U+FFFD rather than discarded.
func input_from_stdin() -> String {
    let bytes = FileHandle.standardInput.readDataToEndOfFile()
    return String(decoding: bytes, as: UTF8.self)
}
/// Builds the input text from command-line arguments.
///
/// - Parameter arguments: The full argument vector; the first element (the
///   program name) is skipped. An empty array is accepted.
/// - Returns: The remaining arguments joined by single spaces, followed by a
///   newline. With no remaining arguments the result is just `"\n"`.
func input_from_args(_ arguments: [String]) -> String {
    // dropFirst() is safe on an empty array, unlike remove(at: 0), which traps.
    return arguments.dropFirst().joined(separator: " ") + "\n"
}
/// Returns the text to process: the joined command-line arguments when any
/// were given after the program name, otherwise the contents of stdin.
func input() -> String {
    let argv = CommandLine.arguments
    // argv[0] is the program name, so fewer than two entries means "no arguments".
    return argv.count < 2 ? input_from_stdin() : input_from_args(argv)
}
/// Asks CoreFoundation for the most likely language of `input`.
///
/// - Parameter input: The text to inspect; the whole string is examined.
/// - Returns: The language string reported by
///   `CFStringTokenizerCopyBestStringLanguage`, or `nil` when it reports none.
func languageOf(_ input: String) -> String? {
    // CFRange is expressed in UTF-16 code units, so measure the string that way.
    let wholeString = CFRange(location: 0, length: input.utf16.count)
    guard let language = CFStringTokenizerCopyBestStringLanguage(input as CFString, wholeString) else {
        return nil
    }
    return language as String
}
/// Returns a copy of `input` with combining marks (accents, tone marks, and
/// the like) stripped.
///
/// - Parameter input: The text to clean up; it is not modified.
/// - Returns: The result of applying `kCFStringTransformStripCombiningMarks`
///   to the whole string.
func normalized(_ input: String) -> String {
    // CFStringTransform works in place, so operate on a mutable copy.
    let scratch = NSMutableString(string: input)
    CFStringTransform(scratch, nil, kCFStringTransformStripCombiningMarks, false)
    return scratch as String
}
/// Returns a Latin-script transliteration of `input`.
///
/// - Parameter input: The text to transliterate; it is not modified.
/// - Returns: The result of applying `kCFStringTransformToLatin` to the whole
///   string.
func romanized(_ input: String) -> String {
    // CFStringTransform works in place, so operate on a mutable copy.
    let scratch = NSMutableString(string: input)
    CFStringTransform(scratch, nil, kCFStringTransformToLatin, false)
    return scratch as String
}
/// Splits `input` into word tokens and returns each token's Latin transcription.
///
/// The tokenizer's locale is derived from `languageOf(input)`; when no language
/// is reported, `Locale.current` is used instead.
///
/// - Parameter input: The text to tokenize and romanize.
/// - Returns: One string per word token, in order of appearance. Each entry is
///   the tokenizer's Latin transcription of the token, or the token's original
///   text when the tokenizer provides no transcription for it.
func tokenized_romanized(_ input: String) -> [String] {
    let locale = languageOf(input).map { Locale(identifier: $0) } ?? Locale.current
    // CFRange is expressed in UTF-16 code units, so measure the string that way.
    let wholeString = CFRange(location: 0, length: input.utf16.count)
    let tokenizer = CFStringTokenizerCreate(
        kCFAllocatorDefault,
        input as CFString,
        wholeString,
        kCFStringTokenizerUnitWord,
        locale as CFLocale
    )

    var result: [String] = []
    // An empty token type means the tokenizer has run out of tokens.
    while !CFStringTokenizerAdvanceToNextToken(tokenizer).isEmpty {
        let attribute = CFStringTokenizerCopyCurrentTokenAttribute(
            tokenizer,
            kCFStringTokenizerAttributeLatinTranscription
        )
        if let latin = attribute as? String {
            result.append(latin)
        } else {
            // No transcription available: keep the token as written instead of
            // crashing on a forced cast of a missing attribute.
            let tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer)
            let original = (input as NSString).substring(
                with: NSRange(location: tokenRange.location, length: tokenRange.length)
            )
            result.append(original)
        }
    }
    return result
}
// Entry point: romanize the input word by word, then print the result twice.
// The first line (cyan) keeps the transcription as produced; the second
// (yellow) is the same text with combining marks stripped.
let script = tokenized_romanized(input()).joined(separator: " ")
print("\u{001B}[36m\(script)\u{001B}[0m")
print("\u{001B}[33m\(normalized(script))\u{001B}[0m")
@dorentus
Copy link
Author

Examples:

$ romanized 'Bigger than bigger.'
Bigger than bigger

$ romanized 比更大还更大
bi geng dahuan gengda

$ romanized 豈止於大。
qizhi yu da

$ romanized 大きさ以上に大きく進化
ooki sa ijou ni ookiku shinka

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment