doraTeX/word2tipa.swift

## word2tipa.swift
import Foundation

// MARK: - Default settings

/// How to treat pronunciation symbols enclosed in parentheses, such as (ə).
enum ParenTreatment: Int {
    /// Leave the parenthesized symbol as it is.
    case none = 1
    /// Drop the parentheses but keep the symbol.
    case flatten = 2
    /// Remove the parentheses together with the symbol inside them.
    case remove = 3
}
// By default, parenthesized symbols such as (ə) are left as they are.
var parenTreatment = ParenTreatment.none

// Whether to put a stress mark on monosyllabic words.
var stressMonosyllabicWord: Bool = false

// Whether to remove secondary stress marks.
var removeSecondStress: Bool = false

// Substitute printed when no pronunciation symbols can be found.
let NOTFOUND: String = "☃"


extension String {
    /// The string split into its characters (extended grapheme clusters),
    /// each returned as a separate `String`.
    var stringArray: [String] {
        map { String($0) }
    }

    /// Replaces every match of the regular expression `pattern` in place.
    ///
    /// - Parameters:
    ///   - pattern: An `NSRegularExpression` pattern. If it does not compile,
    ///     the string is left unchanged.
    ///   - replace: Called once per match with the matched text and the text of
    ///     every capture group (group 1 first). A group that did not take part
    ///     in the match is passed as `""`. The returned string is inserted
    ///     verbatim in place of the match.
    mutating func replaceAllOccurrences(of pattern: String, replace: (String, [String]) -> String) {
        let regex: NSRegularExpression
        do {
            regex = try NSRegularExpression(pattern: pattern, options: [])
        } catch {
            // Deliberately best-effort: an invalid pattern is a no-op, not a crash.
            return
        }

        let buffer = NSMutableString(string: self)
        let wholeRange = NSRange(location: 0, length: buffer.length)

        // Process the matches back to front so that a replacement never shifts
        // the ranges of the matches that are still waiting to be handled.
        for match in regex.matches(in: self, options: [], range: wholeRange).reversed() {
            let groups: [String] = (1..<match.numberOfRanges).map { index in
                let groupRange = match.range(at: index)
                // A group that did not participate reports NSNotFound; asking
                // for that substring would raise a range exception.
                return groupRange.location == NSNotFound ? "" : buffer.substring(with: groupRange)
            }
            let matchedText = buffer.substring(with: match.range)
            buffer.replaceCharacters(in: match.range, with: replace(matchedText, groups))
        }

        self = buffer as String
    }

    /// Replaces the first match of the regular expression `pattern` in place.
    /// Does nothing when the pattern does not match.
    ///
    /// - Parameters:
    ///   - pattern: A regular expression, evaluated with `.regularExpression`.
    ///   - replace: Receives the matched text and returns its replacement.
    mutating func replaceFirstOccurrence(of pattern: String, replace: (String) -> String) {
        guard let matchRange = range(of: pattern, options: .regularExpression) else { return }
        replaceSubrange(matchRange, with: replace(String(self[matchRange])))
    }
}

/// Looks words up through Dictionary Services while temporarily replacing the
/// `DCSActiveDictionaries` entry stored under `com.apple.DictionaryServices`
/// in the "Apple Global Domain" defaults domain.
class DictionaryServiceManager {
    private let AppleGlobalDomainName = "Apple Global Domain"
    private let DictionaryServicesKey = "com.apple.DictionaryServices"
    private let ActiveDictionariesKey = "DCSActiveDictionaries"

    /// The persistent global defaults domain, if it can be read.
    private var globalDomain: [String: Any]? {
        UserDefaults.standard.persistentDomain(forName: AppleGlobalDomainName)
    }

    /// The Dictionary Services preferences, or `nil` when the entry is missing
    /// or is not a string-keyed dictionary (a conditional cast, so an
    /// unexpected shape no longer crashes).
    private var dictionaryPreferences: [String: AnyObject]? {
        globalDomain?[DictionaryServicesKey] as? [String: AnyObject]
    }

    /// The raw value currently stored for the active-dictionaries key, kept
    /// untyped so it can be written back exactly as it was found.
    private var activeDictionariesValue: AnyObject? {
        dictionaryPreferences?[ActiveDictionariesKey]
    }

    /// Stores `value` under the active-dictionaries key, or removes the key
    /// when `value` is `nil`. Does nothing when the Dictionary Services
    /// preferences or the global domain are unavailable.
    private func writeActiveDictionaries(_ value: AnyObject?) {
        guard var preferences = dictionaryPreferences, var domain = globalDomain else { return }
        preferences[ActiveDictionariesKey] = value // assigning nil removes the key
        domain[DictionaryServicesKey] = preferences
        UserDefaults.standard.setPersistentDomain(domain, forName: AppleGlobalDomainName)
    }

    /// Looks `word` up with `dictionaryPath` installed as the only active dictionary.
    ///
    /// The previous active-dictionaries value is put back on every exit path;
    /// if the key did not exist before the call, it is removed again.
    ///
    /// - Parameters:
    ///   - word: The text to look up; its whole UTF-16 range is passed to the lookup.
    ///   - dictionaryPath: The entry written as the sole active dictionary.
    /// - Returns: The text returned by `DCSCopyTextDefinition`, or `nil` when it returns nothing.
    func lookUp(_ word: String, inDictionary dictionaryPath: String) -> String? {
        let previousValue = activeDictionariesValue
        writeActiveDictionaries([dictionaryPath] as AnyObject)
        defer { writeActiveDictionaries(previousValue) }

        let range = CFRangeMake(0, word.utf16.count)
        return DCSCopyTextDefinition(nil, word as CFString, range)?.takeRetainedValue() as String?
    }
}

/// Looks up a word's pronunciation in the WISDOM dictionary and rewrites it as
/// input for the LaTeX `\textipa` command.
class TipaConverter {
    /// TIPA input for every character that needs rewriting; characters that are
    /// not listed are copied through unchanged.
    ///
    /// Accented vowels are spelled as base letter + combining mark (U+0301
    /// acute, U+0300 grave). Swift compares and hashes strings by canonical
    /// equivalence, so precomposed and decomposed input both find their entry.
    private static let tipaTable: [String: String] = [
        // Symbols
        "ː": ":",
        "ˈ": "\"",
        "ˌ": "\"\"",
        // Consonants
        "ʃ": "S",
        "ʒ": "Z",
        "θ": "T",
        "ð": "D",
        "ŋ": "N",
        "ɡ": "g",
        "ʔ": "P",
        "ɾ": "\\textfishhookr ",
        // Vowels: acute, grave, plain (plain e, u, o, i, a need no rewriting)
        "ə\u{0301}": "\\'@", "ə\u{0300}": "\\`@", "ə": "@",
        "e\u{0301}": "\\'e", "e\u{0300}": "\\`e",
        "u\u{0301}": "\\'u", "u\u{0300}": "\\`u",
        "ʌ\u{0301}": "\\'2", "ʌ\u{0300}": "\\`2", "ʌ": "2",
        "ʊ\u{0301}": "\\'U", "ʊ\u{0300}": "\\`U", "ʊ": "U",
        "o\u{0301}": "\\'o", "o\u{0300}": "\\`o",
        "ɔ\u{0301}": "\\'O", "ɔ\u{0300}": "\\`O", "ɔ": "O",
        "ɪ\u{0301}": "\\'I", "ɪ\u{0300}": "\\`I", "ɪ": "I",
        "i\u{0301}": "\\'\\i ", "i\u{0300}": "\\`\\i ",
        "æ\u{0301}": "\\'\\ae ", "æ\u{0300}": "\\`\\ae ", "æ": "\\ae ",
        "ɑ\u{0301}": "\\'A", "ɑ\u{0300}": "\\`A", "ɑ": "A",
        "ɒ\u{0301}": "\\'6", "ɒ\u{0300}": "\\`6", "ɒ": "6",
        "a\u{0301}": "\\'a", "a\u{0300}": "\\`a",
        "ɜ\u{0301}": "\\'3", "ɜ\u{0300}": "\\`3", "ɜ": "3",
        "ɛ\u{0301}": "\\'E", "ɛ\u{0300}": "\\`E", "ɛ": "E",
    ]

    /// Vowels that can carry a primary-stress (acute) mark.
    private static let stressableVowels = [
        "ə", "e", "u", "ʌ", "ʊ", "o", "ɔ", "ɪ", "i", "æ", "ɑ", "ɒ", "a", "ɜ", "ɛ",
    ]

    /// Returns the pronunciation of `word` found in the WISDOM dictionary
    /// entry, or `nil` when the lookup returns no text or the text contains no
    /// pronunciation field.
    ///
    /// The pronunciation is the first span that opens with `|` and runs to the
    /// next `|`, `/` or `,`. Surrounding whitespace is trimmed, then the global
    /// settings `parenTreatment`, `stressMonosyllabicWord` and
    /// `removeSecondStress` are applied.
    func lookUpPronunciationSymbolsInWisdom(_ word: String) -> String? {
        let dicName = "Sanseido The WISDOM English-Japanese Japanese-English Dictionary.dictionary"
        let wisdomRegex = "\\|.*?[\\|/,]"
        guard let content = DictionaryServiceManager().lookUp(word, inDictionary: dicName),
              let match = content.range(of: wisdomRegex, options: .regularExpression) else { return nil }

        // Strip the opening "|" and the closing delimiter from the matched span.
        let inner = content.index(after: match.lowerBound)..<content.index(before: match.upperBound)
        var pron = String(content[inner])

        pron.replaceFirstOccurrence(of: "^\\s*", replace: { _ in "" })
        pron.replaceFirstOccurrence(of: "\\s*$", replace: { _ in "" })

        switch parenTreatment {
        case .none:
            break
        case .flatten:
            // Keep the enclosed symbol, drop only the parentheses.
            pron.replaceAllOccurrences(of: "\\(|\\)", replace: { (_, _) in "" })
        case .remove:
            // Drop the parentheses together with what they enclose.
            pron.replaceAllOccurrences(of: "\\(.*?\\)", replace: { (_, _) in "" })
        }

        return arrangePronunciation(pron)
    }

    /// Rewrites a pronunciation written in IPA characters as TIPA input.
    ///
    /// The input is canonically decomposed and converted character by
    /// character through `tipaTable`; characters without an entry are copied
    /// unchanged.
    func convertToTipa(_ pronunciation: String) -> String {
        var result = ""
        for cluster in pronunciation.decomposedStringWithCanonicalMapping {
            var symbol = String(cluster)
            let units = symbol as NSString
            // Dental diacritic, subscript bridge (U+032A): emit the macro and
            // continue with the first UTF-16 unit of the cluster as the symbol.
            if units.length > 1 && units.character(at: 1) == 0x032A {
                symbol = NSString(format: "%C", units.character(at: 0)) as String
                result += "\\textsubbridge "
            }
            result += TipaConverter.tipaTable[symbol] ?? symbol
        }

        // Remove a space that is directly followed by one of \ : ( ) ] /
        result.replaceAllOccurrences(of: " ([\\\\:\\(\\)\\]/])", replace: { (_, groups) in groups[0] })
        return result
    }

    /// Whether `pronunciation` contains a vowel carrying an acute (primary stress) mark.
    private func stressIncluded(_ pronunciation: String) -> Bool {
        // Build "ə́|é|ú|…" from the vowel list instead of relying on literal
        // combining characters; both sides are compared in precomposed form.
        let pattern = TipaConverter.stressableVowels
            .map { $0 + "\u{0301}" }
            .joined(separator: "|")
            .precomposedStringWithCanonicalMapping
        return pronunciation.precomposedStringWithCanonicalMapping
            .range(of: pattern, options: .regularExpression) != nil
    }

    /// Applies the global stress-related settings to `pronunciation`.
    private func arrangePronunciation(_ pronunciation: String) -> String {
        var pron = pronunciation

        // Put a stress mark on the first vowel when the word has none.
        if stressMonosyllabicWord, !stressIncluded(pron) {
            pron.replaceFirstOccurrence(of: "[eɑɒʌaoɔɪiæuʊəɜɛ]", replace: { vowel in vowel == "i" ? "í" : vowel + "\u{0301}" })
        }

        // Remove secondary stress: strip every combining grave accent, not
        // only the first one, so words with several secondary stresses are
        // cleaned completely.
        if removeSecondStress {
            var kept = String.UnicodeScalarView()
            for scalar in pron.decomposedStringWithCanonicalMapping.unicodeScalars where scalar != "\u{0300}" {
                kept.append(scalar)
            }
            pron = String(kept)
        }

        return pron
    }

    /// Converts `word` to a `\textipa{...}` expression, or returns `NOTFOUND`
    /// when no pronunciation is available.
    ///
    /// - Parameters:
    ///   - word: The word to look up.
    ///   - enclose: Currently ignored; the result is always wrapped in `\textipa{...}`.
    func wordToTipa(_ word: String, enclose: Bool = false) -> String {
        guard let pronunciation = lookUpPronunciationSymbolsInWisdom(word) else { return NOTFOUND }
        let result = convertToTipa(pronunciation)
        return "\\textipa{\(result)}"
    }
}


/// Prints the usage line and the list of supported options to standard output.
func showHelpMessage() {
    let usageLines = [
        "[Usage]",
        "  word2tipa [options] word(s)...",
    ]
    let optionLines = [
        "[Options]",
        "  --flattenParen : flatten an enclosed symbol (e.g., (ə) → ə)",
        "  --removeParen : remove an enclosed symbol (e.g., (ə))",
        "  --stressMonosyllabicWord : e.g., /bɪɡ/ → /bɪ́ɡ/",
        "  --removeSecondStress : e.g., /ɪ̀ntərfɪ́ər/ → /ɪntərfɪ́ər/",
    ]
    for line in usageLines + optionLines {
        print(line)
    }
}

/// Main

// Read the leading command-line options and override the default settings.
var arguments = CommandLine.arguments.dropFirst()
parseOptions: while true {
    // Running out of arguments, before or after the options, means no word was given.
    guard let candidate = arguments.first else {
        print("[ERROR] no arguments\n")
        showHelpMessage()
        exit(1)
    }
    // The first argument that does not start with "-" begins the word list.
    guard candidate.first == "-" else { break parseOptions }
    arguments = arguments.dropFirst()

    // Option names are compared case-insensitively.
    let normalized = candidate.lowercased()
    if normalized == "--flattenParen".lowercased() {
        parenTreatment = .flatten
    } else if normalized == "--removeParen".lowercased() {
        parenTreatment = .remove
    } else if normalized == "--stressMonosyllabicWord".lowercased() {
        stressMonosyllabicWord = true
    } else if normalized == "--removeSecondStress".lowercased() {
        removeSecondStress = true
    } else if normalized == "--help".lowercased() {
        showHelpMessage()
        exit(0)
    } else {
        print("[ERROR] unrecognized argument: \(candidate)\n")
        showHelpMessage()
        exit(1)
    }
}

// Convert every remaining argument and print the results separated by spaces, without a trailing newline.
let convertedWords = arguments.map { word in TipaConverter().wordToTipa(word) }
print(convertedWords.joined(separator: " "), terminator: "")
	import Foundation

	// MARK: - Default settings

	/// How to treat pronunciation symbols enclosed in parentheses, such as (ə).
	enum ParenTreatment: Int {
	    /// Leave the parenthesized symbol as it is.
	    case none = 1
	    /// Drop the parentheses but keep the symbol.
	    case flatten = 2
	    /// Remove the parentheses together with the symbol inside them.
	    case remove = 3
	}
	// By default, parenthesized symbols such as (ə) are left as they are.
	var parenTreatment = ParenTreatment.none

	// Whether to put a stress mark on monosyllabic words.
	var stressMonosyllabicWord: Bool = false

	// Whether to remove secondary stress marks.
	var removeSecondStress: Bool = false

	// Substitute printed when no pronunciation symbols can be found.
	let NOTFOUND: String = "☃"


	extension String {
	    /// The string split into its characters (extended grapheme clusters),
	    /// each returned as a separate `String`.
	    var stringArray: [String] {
	        map { String($0) }
	    }

	    /// Replaces every match of the regular expression `pattern` in place.
	    ///
	    /// - Parameters:
	    ///   - pattern: An `NSRegularExpression` pattern. If it does not compile,
	    ///     the string is left unchanged.
	    ///   - replace: Called once per match with the matched text and the text of
	    ///     every capture group (group 1 first). A group that did not take part
	    ///     in the match is passed as `""`. The returned string is inserted
	    ///     verbatim in place of the match.
	    mutating func replaceAllOccurrences(of pattern: String, replace: (String, [String]) -> String) {
	        let regex: NSRegularExpression
	        do {
	            regex = try NSRegularExpression(pattern: pattern, options: [])
	        } catch {
	            // Deliberately best-effort: an invalid pattern is a no-op, not a crash.
	            return
	        }

	        let buffer = NSMutableString(string: self)
	        let wholeRange = NSRange(location: 0, length: buffer.length)

	        // Process the matches back to front so that a replacement never shifts
	        // the ranges of the matches that are still waiting to be handled.
	        for match in regex.matches(in: self, options: [], range: wholeRange).reversed() {
	            let groups: [String] = (1..<match.numberOfRanges).map { index in
	                let groupRange = match.range(at: index)
	                // A group that did not participate reports NSNotFound; asking
	                // for that substring would raise a range exception.
	                return groupRange.location == NSNotFound ? "" : buffer.substring(with: groupRange)
	            }
	            let matchedText = buffer.substring(with: match.range)
	            buffer.replaceCharacters(in: match.range, with: replace(matchedText, groups))
	        }

	        self = buffer as String
	    }

	    /// Replaces the first match of the regular expression `pattern` in place.
	    /// Does nothing when the pattern does not match.
	    ///
	    /// - Parameters:
	    ///   - pattern: A regular expression, evaluated with `.regularExpression`.
	    ///   - replace: Receives the matched text and returns its replacement.
	    mutating func replaceFirstOccurrence(of pattern: String, replace: (String) -> String) {
	        guard let matchRange = range(of: pattern, options: .regularExpression) else { return }
	        replaceSubrange(matchRange, with: replace(String(self[matchRange])))
	    }
	}

	/// Looks words up through Dictionary Services while temporarily replacing the
	/// `DCSActiveDictionaries` entry stored under `com.apple.DictionaryServices`
	/// in the "Apple Global Domain" defaults domain.
	class DictionaryServiceManager {
	    private let AppleGlobalDomainName = "Apple Global Domain"
	    private let DictionaryServicesKey = "com.apple.DictionaryServices"
	    private let ActiveDictionariesKey = "DCSActiveDictionaries"

	    /// The persistent global defaults domain, if it can be read.
	    private var globalDomain: [String: Any]? {
	        UserDefaults.standard.persistentDomain(forName: AppleGlobalDomainName)
	    }

	    /// The Dictionary Services preferences, or `nil` when the entry is missing
	    /// or is not a string-keyed dictionary (a conditional cast, so an
	    /// unexpected shape no longer crashes).
	    private var dictionaryPreferences: [String: AnyObject]? {
	        globalDomain?[DictionaryServicesKey] as? [String: AnyObject]
	    }

	    /// The raw value currently stored for the active-dictionaries key, kept
	    /// untyped so it can be written back exactly as it was found.
	    private var activeDictionariesValue: AnyObject? {
	        dictionaryPreferences?[ActiveDictionariesKey]
	    }

	    /// Stores `value` under the active-dictionaries key, or removes the key
	    /// when `value` is `nil`. Does nothing when the Dictionary Services
	    /// preferences or the global domain are unavailable.
	    private func writeActiveDictionaries(_ value: AnyObject?) {
	        guard var preferences = dictionaryPreferences, var domain = globalDomain else { return }
	        preferences[ActiveDictionariesKey] = value // assigning nil removes the key
	        domain[DictionaryServicesKey] = preferences
	        UserDefaults.standard.setPersistentDomain(domain, forName: AppleGlobalDomainName)
	    }

	    /// Looks `word` up with `dictionaryPath` installed as the only active dictionary.
	    ///
	    /// The previous active-dictionaries value is put back on every exit path;
	    /// if the key did not exist before the call, it is removed again.
	    ///
	    /// - Parameters:
	    ///   - word: The text to look up; its whole UTF-16 range is passed to the lookup.
	    ///   - dictionaryPath: The entry written as the sole active dictionary.
	    /// - Returns: The text returned by `DCSCopyTextDefinition`, or `nil` when it returns nothing.
	    func lookUp(_ word: String, inDictionary dictionaryPath: String) -> String? {
	        let previousValue = activeDictionariesValue
	        writeActiveDictionaries([dictionaryPath] as AnyObject)
	        defer { writeActiveDictionaries(previousValue) }

	        let range = CFRangeMake(0, word.utf16.count)
	        return DCSCopyTextDefinition(nil, word as CFString, range)?.takeRetainedValue() as String?
	    }
	}

	/// Looks up a word's pronunciation in the WISDOM dictionary and rewrites it as
	/// input for the LaTeX `\textipa` command.
	class TipaConverter {
	    /// TIPA input for every character that needs rewriting; characters that are
	    /// not listed are copied through unchanged.
	    ///
	    /// Accented vowels are spelled as base letter + combining mark (U+0301
	    /// acute, U+0300 grave). Swift compares and hashes strings by canonical
	    /// equivalence, so precomposed and decomposed input both find their entry.
	    private static let tipaTable: [String: String] = [
	        // Symbols
	        "ː": ":",
	        "ˈ": "\"",
	        "ˌ": "\"\"",
	        // Consonants
	        "ʃ": "S",
	        "ʒ": "Z",
	        "θ": "T",
	        "ð": "D",
	        "ŋ": "N",
	        "ɡ": "g",
	        "ʔ": "P",
	        "ɾ": "\\textfishhookr ",
	        // Vowels: acute, grave, plain (plain e, u, o, i, a need no rewriting)
	        "ə\u{0301}": "\\'@", "ə\u{0300}": "\\`@", "ə": "@",
	        "e\u{0301}": "\\'e", "e\u{0300}": "\\`e",
	        "u\u{0301}": "\\'u", "u\u{0300}": "\\`u",
	        "ʌ\u{0301}": "\\'2", "ʌ\u{0300}": "\\`2", "ʌ": "2",
	        "ʊ\u{0301}": "\\'U", "ʊ\u{0300}": "\\`U", "ʊ": "U",
	        "o\u{0301}": "\\'o", "o\u{0300}": "\\`o",
	        "ɔ\u{0301}": "\\'O", "ɔ\u{0300}": "\\`O", "ɔ": "O",
	        "ɪ\u{0301}": "\\'I", "ɪ\u{0300}": "\\`I", "ɪ": "I",
	        "i\u{0301}": "\\'\\i ", "i\u{0300}": "\\`\\i ",
	        "æ\u{0301}": "\\'\\ae ", "æ\u{0300}": "\\`\\ae ", "æ": "\\ae ",
	        "ɑ\u{0301}": "\\'A", "ɑ\u{0300}": "\\`A", "ɑ": "A",
	        "ɒ\u{0301}": "\\'6", "ɒ\u{0300}": "\\`6", "ɒ": "6",
	        "a\u{0301}": "\\'a", "a\u{0300}": "\\`a",
	        "ɜ\u{0301}": "\\'3", "ɜ\u{0300}": "\\`3", "ɜ": "3",
	        "ɛ\u{0301}": "\\'E", "ɛ\u{0300}": "\\`E", "ɛ": "E",
	    ]

	    /// Vowels that can carry a primary-stress (acute) mark.
	    private static let stressableVowels = [
	        "ə", "e", "u", "ʌ", "ʊ", "o", "ɔ", "ɪ", "i", "æ", "ɑ", "ɒ", "a", "ɜ", "ɛ",
	    ]

	    /// Returns the pronunciation of `word` found in the WISDOM dictionary
	    /// entry, or `nil` when the lookup returns no text or the text contains no
	    /// pronunciation field.
	    ///
	    /// The pronunciation is the first span that opens with `|` and runs to the
	    /// next `|`, `/` or `,`. Surrounding whitespace is trimmed, then the global
	    /// settings `parenTreatment`, `stressMonosyllabicWord` and
	    /// `removeSecondStress` are applied.
	    func lookUpPronunciationSymbolsInWisdom(_ word: String) -> String? {
	        let dicName = "Sanseido The WISDOM English-Japanese Japanese-English Dictionary.dictionary"
	        // "\\|" is the regex for a literal pipe; "\|" alone is not a valid Swift escape.
	        let wisdomRegex = "\\|.*?[\\|/,]"
	        guard let content = DictionaryServiceManager().lookUp(word, inDictionary: dicName),
	              let match = content.range(of: wisdomRegex, options: .regularExpression) else { return nil }

	        // Strip the opening "|" and the closing delimiter from the matched span.
	        let inner = content.index(after: match.lowerBound)..<content.index(before: match.upperBound)
	        var pron = String(content[inner])

	        pron.replaceFirstOccurrence(of: "^\\s*", replace: { _ in "" })
	        pron.replaceFirstOccurrence(of: "\\s*$", replace: { _ in "" })

	        switch parenTreatment {
	        case .none:
	            break
	        case .flatten:
	            // Keep the enclosed symbol, drop only the parentheses.
	            pron.replaceAllOccurrences(of: "\\(|\\)", replace: { (_, _) in "" })
	        case .remove:
	            // Drop the parentheses together with what they enclose.
	            pron.replaceAllOccurrences(of: "\\(.*?\\)", replace: { (_, _) in "" })
	        }

	        return arrangePronunciation(pron)
	    }

	    /// Rewrites a pronunciation written in IPA characters as TIPA input.
	    ///
	    /// The input is canonically decomposed and converted character by
	    /// character through `tipaTable`; characters without an entry are copied
	    /// unchanged.
	    func convertToTipa(_ pronunciation: String) -> String {
	        var result = ""
	        for cluster in pronunciation.decomposedStringWithCanonicalMapping {
	            var symbol = String(cluster)
	            let units = symbol as NSString
	            // Dental diacritic, subscript bridge (U+032A): emit the macro and
	            // continue with the first UTF-16 unit of the cluster as the symbol.
	            if units.length > 1 && units.character(at: 1) == 0x032A {
	                symbol = NSString(format: "%C", units.character(at: 0)) as String
	                result += "\\textsubbridge "
	            }
	            result += TipaConverter.tipaTable[symbol] ?? symbol
	        }

	        // Remove a space that is directly followed by one of \ : ( ) ] /
	        result.replaceAllOccurrences(of: " ([\\\\:\\(\\)\\]/])", replace: { (_, groups) in groups[0] })
	        return result
	    }

	    /// Whether `pronunciation` contains a vowel carrying an acute (primary stress) mark.
	    private func stressIncluded(_ pronunciation: String) -> Bool {
	        // Build "ə́|é|ú|…" from the vowel list instead of relying on literal
	        // combining characters; both sides are compared in precomposed form.
	        let pattern = TipaConverter.stressableVowels
	            .map { $0 + "\u{0301}" }
	            .joined(separator: "|")
	            .precomposedStringWithCanonicalMapping
	        return pronunciation.precomposedStringWithCanonicalMapping
	            .range(of: pattern, options: .regularExpression) != nil
	    }

	    /// Applies the global stress-related settings to `pronunciation`.
	    private func arrangePronunciation(_ pronunciation: String) -> String {
	        var pron = pronunciation

	        // Put a stress mark on the first vowel when the word has none.
	        if stressMonosyllabicWord, !stressIncluded(pron) {
	            pron.replaceFirstOccurrence(of: "[eɑɒʌaoɔɪiæuʊəɜɛ]", replace: { vowel in vowel == "i" ? "í" : vowel + "\u{0301}" })
	        }

	        // Remove secondary stress: strip every combining grave accent, not
	        // only the first one, so words with several secondary stresses are
	        // cleaned completely.
	        if removeSecondStress {
	            var kept = String.UnicodeScalarView()
	            for scalar in pron.decomposedStringWithCanonicalMapping.unicodeScalars where scalar != "\u{0300}" {
	                kept.append(scalar)
	            }
	            pron = String(kept)
	        }

	        return pron
	    }

	    /// Converts `word` to a `\textipa{...}` expression, or returns `NOTFOUND`
	    /// when no pronunciation is available.
	    ///
	    /// - Parameters:
	    ///   - word: The word to look up.
	    ///   - enclose: Currently ignored; the result is always wrapped in `\textipa{...}`.
	    func wordToTipa(_ word: String, enclose: Bool = false) -> String {
	        guard let pronunciation = lookUpPronunciationSymbolsInWisdom(word) else { return NOTFOUND }
	        let result = convertToTipa(pronunciation)
	        return "\\textipa{\(result)}"
	    }
	}


	/// Prints the usage line and the list of supported options to standard output.
	func showHelpMessage() {
	    let usageLines = [
	        "[Usage]",
	        " word2tipa [options] word(s)...",
	    ]
	    let optionLines = [
	        "[Options]",
	        " --flattenParen : flatten an enclosed symbol (e.g., (ə) → ə)",
	        " --removeParen : remove an enclosed symbol (e.g., (ə))",
	        " --stressMonosyllabicWord : e.g., /bɪɡ/ → /bɪ́ɡ/",
	        " --removeSecondStress : e.g., /ɪ̀ntərfɪ́ər/ → /ɪntərfɪ́ər/",
	    ]
	    for line in usageLines + optionLines {
	        print(line)
	    }
	}

	/// Main

	// Read the leading command-line options and override the default settings.
	var arguments = CommandLine.arguments.dropFirst()
	parseOptions: while true {
	    // Running out of arguments, before or after the options, means no word was given.
	    guard let candidate = arguments.first else {
	        print("[ERROR] no arguments\n")
	        showHelpMessage()
	        exit(1)
	    }
	    // The first argument that does not start with "-" begins the word list.
	    guard candidate.first == "-" else { break parseOptions }
	    arguments = arguments.dropFirst()

	    // Option names are compared case-insensitively.
	    let normalized = candidate.lowercased()
	    if normalized == "--flattenParen".lowercased() {
	        parenTreatment = .flatten
	    } else if normalized == "--removeParen".lowercased() {
	        parenTreatment = .remove
	    } else if normalized == "--stressMonosyllabicWord".lowercased() {
	        stressMonosyllabicWord = true
	    } else if normalized == "--removeSecondStress".lowercased() {
	        removeSecondStress = true
	    } else if normalized == "--help".lowercased() {
	        showHelpMessage()
	        exit(0)
	    } else {
	        print("[ERROR] unrecognized argument: \(candidate)\n")
	        showHelpMessage()
	        exit(1)
	    }
	}

	// Convert every remaining argument and print the results separated by spaces, without a trailing newline.
	let convertedWords = arguments.map { word in TipaConverter().wordToTipa(word) }
	print(convertedWords.joined(separator: " "), terminator: "")