Last active
August 29, 2015 14:10
-
-
Save joeminicucci/646bd16b672ce5aeff8d to your computer and use it in GitHub Desktop.
Chinese Support plugin modifications
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# | |
# Copyright © 2014 Thomas TEMPÉ, <thomas.tempe@alysse.org> | |
# | |
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html | |
# | |
#COPYRIGHT AND PERMISSION NOTICE | |
#Copyright © 1991-2012 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in http://www.unicode.org/copyright.html. | |
#Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode data files and any associated documentation (the "Data Files") or Unicode software and any associated documentation (the "Software") to deal in the Data Files or Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies of the Data Files or Software, and to permit persons to whom the Data Files or Software are furnished to do so, provided that (a) the above copyright notice(s) and this permission notice appear with all copies of the Data Files or Software, (b) both the above copyright notice(s) and this permission notice appear in associated documentation, and (c) there is clear notice in each modified Data File or in the Software as well as in the documentation associated with the Data File(s) or Software that the data or software has been modified. | |
#THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE. | |
#Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in these Data Files or Software without prior written authorization of the copyright holder. | |
"""Interface to the db/chinese_dict.sql SQLite database containing the local dictionaries | |
Available dictionaries: | |
* Chinese characters (Unihan) | |
* Chinese words (CEDICT), including: | |
* simplified and traditional spellings | |
* pinyin and Taiwan variant pronunciations | |
* English, German and French translations | |
unihan table structure: | |
["cp", "kMandarin", "kCantonese", "kFrequency", "kHangul", "kJapaneseKun", "kSimplifiedVariant", "kTraditionalVariant", "Vietnamese"] | |
cidian table structure: | |
["traditional", "simplified", "pinyin", "pinyin_taiwan", "classifiers", "alternates", "english", "german", "french", "spanish"] | |
""" | |
import sqlite3 | |
import os.path | |
class DictDB:
    """Interface to the db/chinese_dict.sqlite SQLite database holding the
    local dictionaries: Unihan character data and CEDICT word data
    (simplified/traditional spellings, pinyin and Taiwan-variant
    pronunciations, English/German/French/Spanish translations).

    Table layouts, as used by the queries below:
      hanzi:  cp, kMandarin, kCantonese, kFrequency, kHangul, kJapaneseKun,
              kSimplifiedVariant, kTraditionalVariant, Vietnamese
      cidian: traditional, simplified, pinyin, pinyin_taiwan, classifiers,
              alternates, english, german, french, spanish
    """

    conn = None  # sqlite3 connection, set in __init__
    c = None     # cursor on that connection

    def __init__(self):
        """Open the dictionary DB and create the lookup indexes on first run."""
        try:
            from aqt import mw
            db_file = os.path.join(mw.pm.addonFolder(), "chinese", "db", "chinese_dict.sqlite")
        except Exception:
            # Not running inside Anki; used for local debugging.
            db_file = "db/chinese_dict.sqlite"
        self.conn = sqlite3.connect(db_file)
        self.c = self.conn.cursor()
        # Create the DB indexes. This only succeeds the first time: the
        # indexes are removed from the distribution files to save space.
        try:
            self.c.execute("create index isimplified on cidian ( simplified );")
            self.c.execute("create unique index itraditional on cidian ( traditional, pinyin );")
            self.conn.commit()
        except sqlite3.Error:
            # Indexes already exist (or the DB is read-only); nothing to do.
            pass

    def _fetch_one_value(self, query, args):
        """Run `query`, return the first column of the first row, or None
        when the query matches nothing (or the column is NULL)."""
        self.c.execute(query, args)
        row = self.c.fetchone()
        return row[0] if row is not None else None

    def _get_char_pinyin(self, c):
        """Return the pinyin transcription of a single Hanzi from Unihan,
        or None if it is not in the dictionary.
        If there are multiple possibilities, returns one at random."""
        return self._fetch_one_value("select kMandarin from hanzi where cp = ?;", (c,))

    def _get_word_pinyin(self, w, taiwan=False):
        """Return the pinyin transcription of a word, from CEDICT.
        If it's not in the dictionary, returns None.
        If there are multiple possibilities, returns one at random.
        If taiwan==True, prefer the Taiwan variant when one exists.
        """
        self.c.execute("select pinyin, pinyin_taiwan from cidian where traditional=? or simplified=?;", (w, w))
        row = self.c.fetchone()
        if row is None:
            # Word not in dictionary at all.
            return None
        pinyin, taiwan_pinyin = row
        # pinyin_taiwan is NULL (-> None) for most entries, so only use it
        # when it is actually present; this lets Taiwan pronunciations be
        # resolved word-by-word instead of failing on missing variants.
        if taiwan and taiwan_pinyin is not None:
            return taiwan_pinyin
        return pinyin

    def _map_string(self, w, word_fn, char_fn, wl):
        """Greedy left-to-right conversion of `w`: try the longest word
        lookup first (length wl down to 2) via `word_fn`, then fall back to
        a per-character lookup via `char_fn`; characters not found anywhere
        are copied through unchanged."""
        out = u""
        while len(w) > 0:
            for word_len in range(wl, 1, -1):
                p = word_fn(w[:word_len])
                if p:
                    out += p
                    w = w[word_len:]
                    break
            else:
                # No multi-character word matched: convert one character.
                p = char_fn(w[0])
                out += p if p else w[0]
                w = w[1:]
        return out

    def get_pinyin(self, w, taiwan=False, wl=4):
        """Return the full pinyin transcription of a string.
        Use CEDICT wherever possible; use Unihan to fill in.
        Word lookups go up to `wl` characters, so chengyu (4-character
        idioms) and 3-character words are recognized.
        If taiwan==True, prefer the Taiwan variant.
        """
        p = self._get_word_pinyin(w, taiwan)
        if p:
            return p  # One word, in dictionary.
        if len(w) == 1:
            return self._get_char_pinyin(w)  # Single character.
        # The string is not in the dictionary as a whole. Try each
        # wl-character sequence in turn, then shorter ones, and fall back
        # to per-character lookup. Spacing differs from _map_string: pinyin
        # syllables are space-separated, pass-through characters are not.
        transcription = u""
        last_was_pinyin = False
        while len(w) > 0:
            for word_len in range(wl, 1, -1):
                p = self._get_word_pinyin(w[:word_len], taiwan)
                if p:
                    transcription = add_with_space(transcription, p)
                    w = w[word_len:]
                    last_was_pinyin = True
                    break
            else:
                p = self._get_char_pinyin(w[0])
                if p:
                    transcription = add_with_space(transcription, p)
                    last_was_pinyin = True
                else:
                    # Not a known character: copy it through, padding with
                    # a space only after a pinyin syllable.
                    if last_was_pinyin:
                        transcription += " "
                    transcription += w[0]
                    last_was_pinyin = False
                w = w[1:]
        return transcription

    def get_cantonese(self, w, only_one=True):
        """Return a character-by-character Cantonese transcription.
        Characters not found in Unihan are copied through unchanged.
        If only_one is False, all readings are kept, separated by '|'."""
        t = u""
        for c in w:
            self.c.execute("select kCantonese from hanzi where cp = ?;", (c,))
            row = self.c.fetchone()
            k = row[0] if row is not None else None
            if k is None:
                # Not in the dictionary: keep the original character.
                t += c
            else:
                if only_one:
                    k = k.split(" ")[0]
                else:
                    k = k.replace(" ", "|")
                t = add_with_space(t, k)
        return t

    def _get_char_traditional(self, c):
        """Use Unihan to find the traditional variant of one character,
        or None if there is none in the dictionary."""
        return self._fetch_one_value("select kTraditionalVariant from hanzi where cp = ?;", (c,))

    def _get_word_traditional(self, w):
        """Use CEDICT to find the traditional variant of a word,
        or None if the word is not in the dictionary."""
        return self._fetch_one_value("select traditional from cidian where traditional=? or simplified=?;", (w, w))

    def get_traditional(self, w, wl=4):
        """Return the full traditional form of a string.
        Use CEDICT wherever possible (so multi-character words convert
        correctly, e.g. 面 vs 麵); use Unihan to fill in.
        """
        p = self._get_word_traditional(w)
        if p:
            return p  # One word, in dictionary.
        if len(w) == 1:
            return self._get_char_traditional(w)  # Single character.
        return self._map_string(w, self._get_word_traditional, self._get_char_traditional, wl)

    def _get_char_simplified(self, c):
        """Use Unihan to find the simplified variant of one character,
        or None if there is none in the dictionary."""
        return self._fetch_one_value("select kSimplifiedVariant from hanzi where cp = ?;", (c,))

    def _get_word_simplified(self, w):
        """Use CEDICT to find the simplified variant of a word,
        or None if the word is not in the dictionary."""
        return self._fetch_one_value("select simplified from cidian where traditional=? or simplified=?;", (w, w))

    def get_simplified(self, w, wl=4):
        """Return the full simplified form of a string.
        Use CEDICT wherever possible; use Unihan to fill in.
        """
        p = self._get_word_simplified(w)
        if p:
            return p  # One word, in dictionary.
        if len(w) == 1:
            return self._get_char_simplified(w)  # Single character.
        return self._map_string(w, self._get_word_simplified, self._get_char_simplified, wl)

    def get_definitions(self, w, lang):
        """Return all definitions for a given language.
        `lang` must be one of "en", "de", "fr", "es" (KeyError otherwise).
        Returns a list of (pinyin, definition, classifiers, alternates)
        tuples, ordered by pinyin.
        """
        # The column name is taken from this fixed whitelist, so the string
        # interpolation below cannot inject SQL; the word itself is bound.
        langs = {"en": "english", "de": "german", "fr": "french", "es": "spanish"}
        self.c.execute("select distinct pinyin, %s as definition, classifiers, alternates from cidian where (traditional=? or simplified=?) and length(definition)>0 order by pinyin;" % langs[lang], (w, w))
        return self.c.fetchall()

    def get_classifiers(self, txt):
        """Return the non-empty classifier (measure word) strings for a word."""
        self.c.execute("select distinct classifiers from cidian where (traditional=? or simplified=?);", (txt, txt))
        # fetchall returns a list of 1-tuples; flatten and drop empty/NULL.
        return [row[0] for row in self.c.fetchall() if row[0]]

    def get_alt_spellings(self, txt):
        """Return the non-empty alternate-spelling strings for a word."""
        self.c.execute("select distinct alternates from cidian where (traditional=? or simplified=?);", (txt, txt))
        # fetchall returns a list of 1-tuples; flatten and drop empty/NULL.
        return [row[0] for row in self.c.fetchall() if row[0]]
def add_with_space(a, b):
    """Concatenate `b` onto `a`, inserting a single space unless `a` is
    empty or already ends with a space."""
    if a and not a.endswith(" "):
        return a + " " + b
    return a + b
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Welcome to the Chinese Support Add-on's field edition ruleset. | |
# Here, you can tweak the note editor helper's behavior to your liking. | |
# | |
# If you messed things up, you can safely delete file | |
# addons/chinese/edit_behavior.py from your Anki directory. | |
# It will be recreated the next time you restart Anki. | |
# | |
# You can read about all available functions at: | |
# https://github.com/ttempe/chinese-support-addon/wiki/Edit-behavior | |
# Also, see the Python tutorial at http://docs.python.org/2/tutorial | |
""" Note that I've changed almost everything in this file to my liking. I have cleaned up the code into individual functions. | |
I couldn't figure out what the hell the 1st Case was doing. But I never used the given models anyway. | |
Basically, I've changed the fields to allow pinyin, pinyin (taiwan), cantonese and bopomofo to all be able to be | |
used on the same deck. Additionally, English, German and French translations can be given on the same deck. | |
The problem with having a "setting" change the transcription is that people who are studying both Cantonese and Mandarin, | |
or who want to use both Pinyin and Bopomofo keep having to change the settings. This way, the setting is automatic per deck. | |
However, I have left the option to use the setting chosen if so desired. A few other things have been changed as well. | |
Finally, I have also added "Simplified/TraditionalInclusive" fields where the the simplified/traditional is given even if it | |
is the same as the hanzi field. (Useful if you are mixing simplified and traditional forms for different cards in the deck, | |
but still want to display both the simplified and traditional without weird spacing issues when either the simplified or | |
traditional forms are the same as given. | |
""" | |
from config import chinese_support_config | |
from edit_functions import * | |
from sentDict import sentDict | |
#"Mandarin" is used by the Pinyin Toolkit port by Chris Hatch
anki1_model_names = ["Chinese", "chinese", "Mandarin Vocab"]

#Recognized note field names, by role. Multiple spellings are accepted for
#each role, to remain compatible with Anki1 decks and localized models.
Hanzi_fields = ["Expression", "Hanzi", "Chinese", u"汉字", u"漢字", u"中文"]
Color_fields = ["Color", "Colour", "Colored Hanzi", u"彩色"]
ColorPY_fields = ["ColorPY", "ColourPY"]
ColorPYTW_fields = ["ColorPYTW", "ColourPYTW"]
ColorCANT_fields = ["ColorCANT", "ColourCANT"]
ColorBPMF_fields = ["ColorBPMF", "ColourBPMF"]
Transcription_fields = ["Reading"]
Pinyin_fields = ["Pinyin", "PY", u"拼音", u"大陆拼音", u"大陸拼音"]
PinyinTW_fields = ["PinyinTW", "PYTW", u"臺灣拼音", u"台灣拼音", u"台湾拼音"]
Cantonese_fields = ["Cantonese", "Jyutping", u"廣東話", u"广东话", u"粵語", u"粤语", u"廣州話", u"广州话", u"粵", u"粤", u"粵拼", u"粤拼"]
#BUG FIX: a missing comma merged the first two names into one bogus entry
#(u"注音符號" u"註音符號" is implicit string concatenation).
Bopomofo_fields = [u"注音符號", u"註音符號", u"注音符号", "Bopomofo", u"ㄅㄆㄇㄈ"]
Meaning_fields = ["Meaning", "Definition", u"意思", u"翻译", u"翻譯", u"解释", u"解釋"]
English_fields = ["English", u"英语", u"英語", u"英文"]
German_fields = ["German", "Deutsch", u"德语", u"德語", u"德文"]
French_fields = ["French", "le français", u"法语", u"法語", u"法文"]
Sound_fields = ["Audio", "Sound", "Spoken", u"声音", u"聲音"]
#Excludes simplified/traditional pairs which are the same
Simplified_fields = ["Simplified", "Simp", "Simp.", u"简体", u"簡體", u"简化", u"簡化", u"简体字", u"簡體字", u"简化字", u"簡化字"]
Traditional_fields = ["Traditional", "Trad", "Trad.", u"繁体", u"繁體", u"繁体字", u"繁體字"]
#Includes simplified/traditional pairs which are the same
SimplifiedInclusive_fields = ["Simplified Inclusive"]
TraditionalInclusive_fields = ["Traditional Inclusive"]
Mean_Word_fields = ["Mean Word", "Measure Word", "MW", "Mean", "Classifier", u"量词", u"量詞"]
#"Also writted" is a historical typo kept for backward compatibility with
#existing decks; the corrected spelling is also recognized.
Alternate_fields = ["Also writted", "Also written", "Alt", "Alternate"]
Ruby_fields = ["Ruby"]
RubyPY_fields = ["RubyPY"]
RubyPYTW_fields = ["RubyPYTW"]
RubyCANT_fields = ["RubyCANT"]
RubyBPMF_fields = ["RubyBPMF"]
Silhouette_fields = ["Silhouette"]
#Removed a duplicate "Usage" entry.
Sentence_fields = ["Usage", "Sentences"]
def update_fields(field, updated_field, model_name, model_type):
    """Fill in the other fields of a note after `updated_field` was edited.

    field         -- dict-like mapping of field name to field content
    updated_field -- name of the field the user just edited
    model_name    -- name of the note model (unused here, kept for the
                     edit-behavior hook interface)
    model_type    -- "Chinese Ruby" selects the Ruby-based model handling;
                     anything else uses the Anki1-compatible multi-field rules
    Returns the (mutated) `field` mapping.
    """
    #1st case : the new Ruby-based model
    if model_type == "Chinese Ruby":
        if updated_field == "Hanzi":
            #Update the ruby
            h = colorize(ruby(accentuate_pinyin(field["Hanzi"])))
            #Add the toneless transcription and hanzi, hidden,
            #to make them searchable
            h = hide_ruby(h)
            field["Hanzi"] = h
            if field["Hanzi"] == "":
                field["Meaning"] = ""
            elif field["Meaning"] == "":
                field["Meaning"] = translate(field["Hanzi"])
        elif updated_field[0:5] == "Hanzi":  #Field name starts with "Hanzi"
            field[updated_field] = \
                colorize(ruby(accentuate_pinyin(field[updated_field])))
    #2nd case : use the old Anki1 Pinyin-toolkit rules if the deck is
    #called "Chinese" or was created as "Chinese (compatibility)" from
    #Anki2.
    #Note that we accept multiple field names for each field, to ensure
    #Anki1 compatibility.
    else:
        #Define helper functions (closures over `field`/`updated_field`).
        def update_Meaning_fields():
            #Fill each translation field, only if it is currently empty.
            def fill(dest_fields, translate_args):
                if get_any(dest_fields, field) == "":
                    m = translate(field[updated_field], *translate_args)
                    #If there's no dedicated measure-word field, append it here
                    if not has_field(Mean_Word_fields, field):
                        mw = get_mean_word(field[updated_field])
                        if mw:
                            m += "<br>Cl: " + mw
                    #If there's no dedicated alternate-spelling field, append it here
                    if not has_field(Alternate_fields, field):
                        alt = get_alternate_spellings(field[updated_field])
                        if alt:
                            m += "<br>Also written: " + alt
                    set_all(dest_fields, field, to=m)
            fill(Meaning_fields, ())                     #default dictionary
            fill(English_fields, ("zh", "local_en"))     #English
            fill(German_fields, ("zh", "local_de"))      #German
            fill(French_fields, ("zh", "local_fr"))      #French
            return

        def update_Mean_Word_fields():
            #Update Mean word field only if empty.
            if get_any(Mean_Word_fields, field) == "":
                m = get_mean_word(field[updated_field])
                set_all(Mean_Word_fields, field, to=m)
            return

        def update_Alternative_fields():
            #Update alt spelling field only if empty.
            if get_any(Alternate_fields, field) == "":
                m = get_alternate_spellings(field[updated_field])
                set_all(Alternate_fields, field, to=m)
            return

        def update_Silhouette_fields():
            m = silhouette(get_any(Hanzi_fields, field))
            set_all(Silhouette_fields, field, to=m)
            return

        def update_all_Transcription_fields():
            #Update each transcription field, only if it is empty.
            #The unaccented transcription is hidden in each field,
            #to make searching easier.
            if get_any(Transcription_fields, field) == "":
                t = colorize(transcribe(no_sound(field[updated_field])))
                t = hide(t, no_tone(t))
                set_all(Transcription_fields, field, to=t)
            if get_any(Pinyin_fields, field) == "":
                t = colorize(transcribe(no_sound(field[updated_field]), "Pinyin"))
                t = hide(t, no_tone(t))
                set_all(Pinyin_fields, field, to=t)
            if get_any(PinyinTW_fields, field) == "":
                t = colorize(transcribe(no_sound(field[updated_field]), "Pinyin (Taiwan)"))
                t = hide(t, no_tone(t))
                set_all(PinyinTW_fields, field, to=t)
            if get_any(Cantonese_fields, field) == "":
                t = colorize(transcribe(no_sound(field[updated_field]), "Cantonese", False))
                t = hide(t, no_tone(t))
                set_all(Cantonese_fields, field, to=t)
            if get_any(Bopomofo_fields, field) == "":
                t = colorize(transcribe(no_sound(field[updated_field]), "Bopomofo"))
                t = hide(t, no_tone(t))
                set_all(Bopomofo_fields, field, to=t)
            return

        def update_all_Color_fields():
            #Update Color fields from the Hanzi field,
            h = no_sound(get_any(Hanzi_fields, field))
            #Take the tone info from the Transcription field
            t = no_sound(no_color(get_any(Transcription_fields, field)))
            c = colorize_fuse(h, t)
            set_all(Color_fields, field, to=c)
            #Take the tone info from the Pinyin field
            t = no_sound(no_color(get_any(Pinyin_fields, field)))
            c = colorize_fuse(h, t)
            set_all(ColorPY_fields, field, to=c)
            set_all(Color_fields, field, to=c)
            #Take the tone info from the PinyinTW field
            t = no_sound(no_color(get_any(PinyinTW_fields, field)))
            c = colorize_fuse(h, t)
            set_all(ColorPYTW_fields, field, to=c)
            #Take the tone info from the Cantonese field; Cantonese is
            #written with traditional characters, so convert when no
            #explicit Traditional field is present.
            t = no_sound(no_color(get_any(Cantonese_fields, field)))
            if get_any(Traditional_fields, field) == "":
                h = traditional(h)
            c = colorize_fuse(h, t)
            set_all(ColorCANT_fields, field, to=c)
            #Take the tone info from the Bopomofo field
            t = no_sound(no_color(get_any(Bopomofo_fields, field)))
            c = colorize_fuse(h, t)
            set_all(ColorBPMF_fields, field, to=c)
            return

        def update_Sound_fields():
            #Update Sound field from Hanzi field if non-empty
            #(only if field actually exists, as it implies downloading
            #a soundfile from Internet)
            if has_field(Sound_fields, field) and \
                    get_any(Sound_fields, field) == "":
                set_all(Sound_fields, field, to=sound(field[updated_field]))
            return

        def update_all_Simplified_Traditional_fields():
            #Update simplified/traditional fields. The "Inclusive" fields
            #are always filled; the plain fields are only filled when the
            #converted form differs from the source text.
            s = simplify(field[updated_field])
            set_all(SimplifiedInclusive_fields, field, to=s)
            if s != field[updated_field]:
                set_all(Simplified_fields, field, to=s)
            else:
                set_all(Simplified_fields, field, to="")
            t = traditional(field[updated_field])
            set_all(TraditionalInclusive_fields, field, to=t)
            if t != field[updated_field]:
                set_all(Traditional_fields, field, to=t)
            else:
                set_all(Traditional_fields, field, to="")
            return

        def update_all_Ruby_fields():
            #Update ruby fields (hanzi annotated with each transcription).
            m = colorize_fuse(get_any(Hanzi_fields, field), get_any(Pinyin_fields, field), ruby=True)
            set_all(RubyPY_fields, field, to=m)
            set_all(Ruby_fields, field, to=m)
            m = colorize_fuse(get_any(Hanzi_fields, field), get_any(PinyinTW_fields, field), ruby=True)
            set_all(RubyPYTW_fields, field, to=m)
            m = colorize_fuse(get_any(Hanzi_fields, field), get_any(Bopomofo_fields, field), ruby=True)
            set_all(RubyBPMF_fields, field, to=m)
            m = colorize_fuse(get_any(Hanzi_fields, field), get_any(Cantonese_fields, field), ruby=True)
            if get_any(Traditional_fields, field) != "":
                m = traditional(m)
            set_all(RubyCANT_fields, field, to=m)
            return

        def update_sentence_fields():
            #Fill the example-sentence field from the local sentence
            #dictionary, only if empty.
            u = sentDict()
            if get_any(Sentence_fields, field) == "":
                u = u.senticize(field[updated_field], 1000)
                set_all(Sentence_fields, field, to=u)

        def erase_fields():
            #Erase other fields if the updated field was emptied
            if field[updated_field] == "":
                set_all(Meaning_fields, field, to="")
                set_all(English_fields, field, to="")
                set_all(German_fields, field, to="")
                set_all(French_fields, field, to="")
                set_all(Transcription_fields, field, to="")
                set_all(Pinyin_fields, field, to="")
                set_all(PinyinTW_fields, field, to="")
                set_all(Cantonese_fields, field, to="")
                set_all(Bopomofo_fields, field, to="")
                set_all(Sound_fields, field, to="")
                set_all(Simplified_fields, field, to="")
                set_all(Traditional_fields, field, to="")
                set_all(SimplifiedInclusive_fields, field, to="")
                set_all(TraditionalInclusive_fields, field, to="")
                set_all(Mean_Word_fields, field, to="")
                set_all(Alternate_fields, field, to="")
                set_all(Ruby_fields, field, to="")
                set_all(RubyPY_fields, field, to="")
                set_all(RubyPYTW_fields, field, to="")
                set_all(RubyCANT_fields, field, to="")
                set_all(RubyBPMF_fields, field, to="")
                set_all(Silhouette_fields, field, to="")
                set_all(Sentence_fields, field, to="")
            return

        #Fields to update after the Hanzi field has been modified:
        if updated_field in Hanzi_fields:
            erase_fields()
            update_Meaning_fields()
            update_Mean_Word_fields()
            update_Alternative_fields()
            update_Silhouette_fields()
            update_all_Transcription_fields()
            update_all_Color_fields()
            update_Sound_fields()
            update_all_Simplified_Traditional_fields()
            update_all_Ruby_fields()
            update_sentence_fields()
        #If a transcription was modified, update the Color and Ruby fields
        elif updated_field in Transcription_fields:
            t = colorize(accentuate_pinyin(separate_pinyin(no_color(field[updated_field]))))
            t = hide(t, no_tone(t))
            field[updated_field] = t
            update_all_Color_fields()
            update_all_Ruby_fields()
        elif updated_field in Pinyin_fields:
            t = colorize(accentuate_pinyin(separate_pinyin(no_color(field[updated_field]), True), True))
            t = hide(t, no_tone(t))
            field[updated_field] = t
            update_all_Color_fields()
            update_all_Ruby_fields()
        elif updated_field in PinyinTW_fields:
            t = colorize(accentuate_pinyin(separate_pinyin(no_color(field[updated_field]), True), True))
            t = hide(t, no_tone(t))
            field[updated_field] = t
            #Also update Bopomofo
            set_all(Bopomofo_fields, field, to=pinyin_to_bopomofo(t))
            update_all_Color_fields()
            update_all_Ruby_fields()
        elif updated_field in Cantonese_fields:
            t = colorize(separate_pinyin(no_color(field[updated_field]), True, True))
            t = hide(t, no_tone(t))
            field[updated_field] = t
            update_all_Color_fields()
            update_all_Ruby_fields()
        elif updated_field in Bopomofo_fields:
            t = no_color(field[updated_field])
            t = hide(t, no_tone(t))
            field[updated_field] = t
            update_all_Color_fields()
            update_all_Ruby_fields()
        #If the traditional/simplified inclusive fields were modified, update the others
        elif updated_field in SimplifiedInclusive_fields:
            s = field[updated_field]
            if s == "":
                s = get_any(Hanzi_fields, field)
            if s != get_any(Hanzi_fields, field):
                set_all(Simplified_fields, field, to=s)
            else:
                set_all(Simplified_fields, field, to="")
        elif updated_field in Simplified_fields:
            s = field[updated_field]
            if s == "":
                set_all(SimplifiedInclusive_fields, field, to=get_any(Hanzi_fields, field))
            else:
                set_all(SimplifiedInclusive_fields, field, to=s)
        elif updated_field in TraditionalInclusive_fields:
            t = field[updated_field]
            if t == "":
                t = get_any(Hanzi_fields, field)
            if t != get_any(Hanzi_fields, field):
                set_all(Traditional_fields, field, to=t)
            else:
                set_all(Traditional_fields, field, to="")
        #BUG FIX: this branch filled TraditionalInclusive_fields but tested
        #Simplified_fields, duplicating the branch above and making it
        #unreachable; it must trigger on the Traditional fields.
        elif updated_field in Traditional_fields:
            t = field[updated_field]
            if t == "":
                set_all(TraditionalInclusive_fields, field, to=get_any(Hanzi_fields, field))
            else:
                set_all(TraditionalInclusive_fields, field, to=t)
    return field
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# | |
# Copyright © 2012 Thomas TEMPÉ, <thomas.tempe@alysse.org> | |
# | |
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html | |
from aqt import mw | |
import re | |
from config import chinese_support_config | |
import bopomofo as bopomofo_module | |
import google_tts | |
import baidu_tts | |
from microsofttranslator import Translator as MSTranslator | |
import dictdb | |
# Essential Edit functions | |
################################################################## | |
# | |
# You may call any of these functions from the edit_behavior.py file. | |
def colorize(text, ruby_whole=False):
    u'''Add tone color info.
    (can be seen in the card preview, but not the note edit view).
    Works on transcription, hanzi or ruby.
    In the case of ruby, it will colorize only the annotation by default.
    If ruby_whole = True, then it will colorize the whole character.
    Warning : it's not recommended to use this function on hanzi directly,
    since it cannot choose the correct color in the case of
    多音字 (characters with multiple pronunciations).'''
    # Strip any previous coloring first, and set the [sound:...] tags aside
    # so the regexes below cannot match inside them.
    text = no_color(text)
    (text, sound_tags) = extract_sound_tags(text)

    def colorize_hanzi_sub(p):
        # Wrap one hanzi in a tone span; the tone number is obtained by
        # transcribing the character (only_one picks a single reading).
        return u'<span class="tone{t}">{r}</span>'.format(t=get_tone_number(transcribe(p.group(1), only_one=True)), r=p.group())

    def colorize_pinyin_sub(p):
        # Wrap one pinyin syllable in a tone span.
        pinyin = p.group()
        if pinyin[0] in '&<"/':
            # Looks like an HTML entity or tag fragment, not pinyin: leave it.
            return pinyin
        else:
            return u'<span class="tone{t}">{r}</span>'.format(t=get_tone_number(p.group(1)), r=pinyin)

    if has_ruby(text):  # Treat like ruby (hanzi[annotation] pairs)
        if ruby_whole:
            def colorize_ruby_sub(p):
                # Color the hanzi together with its bracketed annotation.
                return u'<span class="tone{t}">{r}</span>'.format(t=get_tone_number(p.group(2)), r=p.group())
            # Match: one CJK char, "[", the pinyin syllable, rest up to "]".
            text = re.sub(u'([\u3400-\u9fff]\[\s*)([a-zü'+accents+u']+1?[0-9¹²³⁴]?)(.*?\])', colorize_ruby_sub, text, flags=re.I)
        else:
            # Only color the annotation part.
            text = re.sub(u'([a-zü'+accents+u']+1?[0-9¹²³⁴]?)', colorize_pinyin_sub, text, flags=re.I)
    elif has_hanzi(text):
        # Plain hanzi: color each character individually.
        text = re.sub(u'([\u3400-\u9fff])', colorize_hanzi_sub, text)
    else:
        # Plain transcription: color each syllable (entity-aware).
        text = re.sub(u'([&<"/]?[a-zü'+accents+u']+1?[0-9¹²³⁴]?)', colorize_pinyin_sub, text, flags=re.I)
    # Re-append the sound tags that were set aside above.
    text = text+sound_tags
    return text
def ruby_top(txt):
    """Extract the top (pronunciation) part of a ruby string."""
    ruby_re = re.compile(r' ?([^ >]+?)\[(.+?)\]')
    return ruby_re.sub(r'\2 ', no_sound(txt))
def ruby_bottom(txt):
    """Extract the bottom (hanzi) part of a ruby string."""
    ruby_re = re.compile(r' ?([^ >]+?)\[(.+?)\]')
    return ruby_re.sub(r'\1 ', no_sound(txt))
def no_color(text):
    "Remove tone color info and other HTML pollutions"
    # None-safe: a missing field becomes empty text.
    if text == None:
        return ""
    # NOTE(review): the two string arguments below render identically;
    # the deleted character is presumably a non-breaking space (or a
    # mangled &nbsp;) — confirm against the original file's bytes.
    text = text.replace(r' ', '')
    text = no_hidden(text)
    #remove color info
    text = re.sub(r'<span class="tone1?[0-9]">(.*?)</span>', r'\1', text)
    #remove black font tag sometimes added by Anki
    text = re.sub(r'<font color="#000000">(.*?)</font>', r'\1', text)
    return text
def hide(text, hidden):
    """Add hidden keyword to string (typically Hanzi and toneless pinyin),
    to make a note searchable in the 'browse' window.
    Returns "" for empty / placeholder-only text.
    """
    if len(text) == 0 or text == "<br />":
        return ""
    hidden = no_color(hidden)
    # BUGFIX: the original called str.replace() with regex patterns
    # ("<.*?>" and "[<!->]"), which replaces nothing because str.replace
    # is literal. Use re.sub to actually strip HTML tags and the
    # characters that could prematurely terminate the HTML comment below.
    hidden = re.sub(r"<.*?>", "", hidden)
    hidden = re.sub(r"[<!>-]", "", hidden)
    return text + "<!--" + hidden + "-->"
def hide_ruby(text):
    """Append hidden hanzi and toneless pinyin to a ruby string,
    so the note can still be found from the 'browse' window.
    """
    pinyin_part = no_tone(ruby_top(text))
    hanzi_part = no_color(ruby_bottom(text)).replace(" ", "")
    return hide(text, pinyin_part + hanzi_part)
def silhouette(hanzi):
    """Replace every Chinese character with an underscore blank.
    Eg: 以A为B -> _A_B
    Eg: 哈密瓜 -> _ _ _
    """
    def space_out(match):
        # "哈密瓜" -> "哈 密 瓜", so consecutive blanks stay readable.
        return " ".join(match.group(0))
    spaced = re.sub(u"[\u3400-\u9fff]+", space_out, hanzi)
    return re.sub(u"[\u3400-\u9fff]", "_", spaced)
def no_hidden(text):
    u"""Strip the invisible <!-- ... --> keyword comments from text."""
    hidden_comment = r"<!--.*?-->"
    return re.sub(hidden_comment, "", text)
def accentuate_pinyin(text, force=False):
    u'''Add accents to pinyin.
    Eg: ni2 becomes ní.
    Eg: ní4 becomes nì. (to make correction easier)
    Does nothing if the default transcription is not Pinyin or Pinyin (Taiwan),
    unless force=True.
    Nota : also removes coloring. If you want color, please add it last.
    '''
    def accentuate_pinyin_sub(p):
        pinyin = p.group(1)
        tone = p.group(2)
        if "tone"==pinyin:
            # Matched inside an HTML class name like "tone2": keep as-is.
            return pinyin+tone
        pinyin = no_tone(pinyin)
        # Decorate the first vowel present, in priority order a,e,i,o,u,ü.
        for v in u"aeiouüvAEIOUÜV":
            if pinyin.find(v)>-1:
                try:
                    return re.sub(v, vowel_decorations[int(tone)][v.lower()], pinyin, count=1)
                except (KeyError, IndexError):
                    # BUGFIX: the old 'except KeyError, IndexError' form
                    # bound the exception to the name IndexError instead of
                    # catching both types (and is invalid in Python 3).
                    pass
        return pinyin
    if chinese_support_config.options['transcription'] \
            not in ['Pinyin', 'Pinyin (Taiwan)'] and not force:
        return text
    text = no_color(text)
    text = re.sub(u'([a-z]*[aeiouüÜv'+accents+u'][a-zü]*)([1-5])', accentuate_pinyin_sub, text, flags=re.I)
    return text
#!Changes! (There was a problem with 5th tone in pinyin not being converted to 5. This caused problems with bopomofo colouring | |
#because the bopomofo conversion could not put the appropriate symbol for 5th tone (since no '5' was given) and it appeared to the | |
#colouriser to be a 1st tone (since in bopomofo, 1st tone has no symbol, like how pinyin has no symbol for 5th tone) | |
#Adding the unaccented vowels removed this problem. | |
def no_accents(text):
    u'''Convert accented pinyin to numbered pinyin. Eg: ní becomes ni2.'''
    def desaccentuate_sub(p):
        head, vowel, tail = p.group(1), p.group(2), p.group(3)
        return head + base_letters[vowel.lower()] + tail + get_tone_number(vowel.lower())
    # Unaccented vowels are matched too, so neutral-tone syllables also
    # get an explicit "5" (required for correct bopomofo conversion).
    # Drop the 'aeiouüvAEIOUÜV' part to ignore the 5th tone instead.
    return re.sub(u'([a-zü]*)(['+u'aeiouüvAEIOUÜV'+accents+u'])([a-zü]*)', desaccentuate_sub, text, flags=re.I)
#!/Changes! | |
def ruby(text, transcription=None, only_one=False, try_dict_first=True):
    u'''Convert hanzi to ruby notation, eg: '你' becomes '你[nǐ]'.
    This can in turn be used with the {{Ruby:fieldname}} card template,
    to generate beautiful ruby-annotated cards.
    If not specified, use the transcription type set in the menubar (eg pinyin).
    if try_dict_first, looks up sequences of characters in the
    selected words dictionary to supply a better transcription.
    If not specified, insert all possible pinyin words for characters not found
    in words dictionary.
    '''
    if transcription == None:
        transcription = chinese_support_config.options['transcription']
    #Replace Chinese typography with its ASCII counterpart
    text = re.sub(u'[[【]', u'[', text)
    text = re.sub(u'[]】]', u']', text)
    #Strip former HTML tone marking and comments
    text = no_color(text)
    text = no_sound(text)
    #Make sure sound tag isn't confused with Hanzi
    text = re.sub(u'([\u3400-\u9fff])(\[sound:)', r'\1 \2', text)
    def insert_multiple_pinyin_sub(p):
        # Annotate a whole run of hanzi from the words dictionary,
        # which returns one space-separated syllable per character.
        hanzi=p.group(1)
        transc = db.get_pinyin(hanzi)
        if not transc:
            # Run not in the dictionary: leave it for the
            # character-by-character pass below.
            return p.group()
        transc = transc.split(" ")
        ret = ""
        hanzi = p.group(1)
        while len(hanzi):
            if "Pinyin" == transcription:
                ret += hanzi[0] + "["+transc.pop(0)+"]"
            elif "Bopomofo" == transcription:
                ret += hanzi[0] + "["
                ret += bopomofo_module.bopomofo(no_accents(transc.pop(0)))+"]"
            hanzi = hanzi[1:]
        return ret+p.group(2)
    def insert_pinyin_sub(p):
        # Annotate one character from the characters dictionary.
        return p.group(1)+'['+get_character_transcription(p.group(1), transcription, only_one)+']'+p.group(2)
    # Temporary sentinel so the last hanzi also has a following
    # character for the ([^[]) group to match; removed below.
    text += '%'
    if try_dict_first and transcription in ["Pinyin", "Bopomofo"]:
        text = re.sub(u'([\u3400-\u9fff]+)([^[])', insert_multiple_pinyin_sub, text)
    # Run twice: each match consumes the character that follows it, so
    # adjacent hanzi need a second pass to all get annotated.
    text = re.sub(u'([\u3400-\u9fff])([^[])', insert_pinyin_sub, text)
    text = re.sub(u'([\u3400-\u9fff])([^[])', insert_pinyin_sub, text)
    text = text[:-1]
    text += sound(text)
    return text
def no_tone(text):
    u'''Removes tone information and coloring.
    Eg: 'ni3' becomes 'ni', 'má' becomes 'ma'
    '''
    text = no_color(text)
    # After no_accents(), every tone is expressed as a trailing number,
    # so only numbered forms need stripping below.
    text = no_accents(text)
    def no_tone_marks_sub(p):
        # Strip tone numbers inside a ruby annotation, keeping "hanzi[".
        return ""+p.group(1)+re.sub(r'1?[0-9¹²³⁴]', '', p.group(2))+"]"
    if has_ruby(text):
        text = re.sub(u'([\u3400-\u9fff]\[)([^[]+?)\]', no_tone_marks_sub, text)
    else:
        text = re.sub(u'([a-zü]+)1?[0-9¹²³⁴]', r'\1', text)
    return text
def hanzi(text):
    u'''Return only the hanzi from a ruby notation.
    Eg: '你[nǐ][You]' becomes '你'.
    '''
    # Drop each bracketed annotation that directly follows a hanzi.
    text = re.sub(u'([\u3400-\u9fff])(\[[^[]+?\])', r'\1', text)
    # Drop sound tags, then any trailing non-hanzi annotation.
    text = re.sub(r'\[sound:.[^[]+?\]', '', text)
    text = re.sub(r'([^\u3400-\u9fff])\[[^[]+?\]\s*$', r'\1', text)
    return text
def transcribe(text, transcription=None, only_one=True):
    u'''
    Convert a hanzi string to the requested transcription.
    Eg : 你 becomes nǐ (transcription="Pinyin", only_one=True)
    Pinyin, Taiwan Pinyin and Bopomofo are looked up in the local words
    dictionaries first, with the characters dictionary as a backup.
    When transcription is None, the type selected in the menu is used.
    Unknown transcription types yield "".
    '''
    text = cleanup(text)
    if "" == text:
        return ""
    if transcription is None:
        transcription = chinese_support_config.options["transcription"]
    if transcription == "Pinyin":
        return db.get_pinyin(text, taiwan=False)
    if transcription == "Pinyin (Taiwan)":
        return db.get_pinyin(text, taiwan=True)
    if transcription == "Cantonese":
        return db.get_cantonese(text, only_one)
    if transcription == "Bopomofo":
        taiwan_pinyin = db.get_pinyin(text, taiwan=True)
        return bopomofo_module.bopomofo(no_accents(taiwan_pinyin))
    return ""
#!Changes! (I've added an extra function which converts pinyin to bopomofo) | |
def pinyin_to_bopomofo(pinyin):
    u'''
    Converts Pinyin to Bopomofo.
    '''
    # Bopomofo conversion requires numbered (not accented) pinyin.
    numbered = no_accents(cleanup(pinyin))
    return bopomofo_module.bopomofo(numbered)
#!/Changes! | |
#Should this function here be removed? It appears to do nothing and get_alternate_spellings is defined later on. | |
def get_alt(text):
    """Returns alternate spelling of Chinese expression"""
    # NOTE(review): stub — the body is only this docstring, so it always
    # returns None. get_alternate_spellings() below does the real work;
    # this function looks like a leftover candidate for removal.
def translate_local(text, lang):
    """Translate using local dictionary.
    lang is one of "en", "fr", "de", "es"
    Returns "" when the expression is unknown, otherwise an HTML
    fragment with one line per definition.
    """
    defs = db.get_definitions(text, lang)
    if 0 == len(defs):
        return ""
    def are_there_multiple_pinyins(defs):
        # True when entries disagree on pronunciation; each definition
        # must then be prefixed with its own pinyin.
        (prev_p, a, b, c)= defs[0]
        for (pinyin, definition, cl, alt) in defs:
            if pinyin != prev_p:  # BUGFIX: was '<>', removed in Python 3
                return True
        return False
    res = ""
    if are_there_multiple_pinyins(defs):
        for (pinyin, definition, cl, alt) in defs:
            res += u"❖ %s[%s] %s\n" % (text, pinyin, definition)
    else:
        for (pinyin, definition, cl, alt) in defs:
            res += " \t"+definition+"\n"
    res = res.replace("\n", "\n<br>")
    res = local_dict_colorize(res)
    return res
def translate(text, from_lang="zh", to_lang=None, progress_bar=True):
    u'''Translate to a different language.
    Eg: '你好' becomes 'Hello'
    Only installed dictionaries can be used.
    to_lang possible values : "local_en", "local_de", "local_fr"
    or a 2-letter ISO language code for MS Translate
    if to_lang is unspecified, the default language will be used.
    if progress_bar is True, then will display a progress bar.
    '''
    global MS_translator_object
    text = cleanup(text)
    if "" == text:
        return ""
    if None == to_lang:
        to_lang = chinese_support_config.options["dictionary"]
    if "None" == to_lang:
        return ""
    if to_lang.startswith("local_"): #Local dict
        return translate_local(text, to_lang[-2:])
    else: #Ms translate
        ret = ""
        if progress_bar:
            mw.progress.start(label="MS Translator lookup", immediate=True)
        if None == MS_translator_object:
            # Lazily create the client. NOTE(review): the client secret is
            # hard-coded here and may be revoked/expired.
            MS_translator_object = MSTranslator("chinese-support-add-on", "Mh+X5YY17LZZ8rO9hzJXYD3I02V3E+ltItF15ep7qG8=")
        try:
            ret = MS_translator_object.translate(text, to_lang)
        except:
            # Best-effort: any network/API failure leaves ret == "".
            pass
        if "ArgumentException:" == ret[:18]:
            #Token has probably expired
            ret=""
        if progress_bar:
            mw.progress.finish()
        return ret
def cleanup(txt):
    """Strip HTML tags and surrounding whitespace; None-safe (returns "")."""
    if not txt:
        return ""
    # Remove HTML tags (re.S so tags spanning line breaks also match).
    txt = re.sub(r"<.*?>", "", txt, flags=re.S)
    # NOTE(review): the two arguments below render identically; the first
    # is presumably a non-breaking space being turned into a plain
    # space — confirm against the original file's bytes.
    txt = txt.replace(" ", " ")
    # Trim leading, then trailing whitespace.
    txt = re.sub(r"^\s*", "", txt)
    txt = re.sub(r"\s*$", "", txt)
    # txt = re.sub(r"[\s+]", " ", txt)
    return txt
def colorize_fuse(hanzi, pinyin, ruby=False):
    u'''Gives color to a Hanzi phrase based on the tone info from a
    corresponding Pinyin phrase.
    If ruby = True, then annotate with pinyin on top of each character
    Eg: "你好" and "ni3 hao3" -> 你好 (both colorized as 3rd tone).
    '''
    # Pad the pinyin with spaces so the syllable consumption below never
    # runs out of input before the hanzi do.
    pinyin = cleanup(no_color(pinyin))+" "*len(hanzi)
    hanzi = cleanup(hanzi)
    text = ""
    # print hanzi, "\t", pinyin
    for h in hanzi:
        if len(pinyin)<5:
            # Keep enough lookahead for the split/indexing below.
            pinyin = pinyin+" "
        if has_hanzi(h):
            # Consume one space-delimited syllable for this character.
            [p, pinyin] = pinyin.split(" ", 1)
            # print "C1\t", h, "\t", p
            if ruby:
                text += u'<span class="tone{t}"><ruby>{h}<rt>{p}</rt></span>'.format(t=get_tone_number(p), h=h, p=p)
            else:
                text += u'<span class="tone{t}">{h}</span>'.format(t=get_tone_number(p), h=h)
        elif " "==h and " "!=pinyin[0]:
            # Hanzi-side space with no matching pinyin space: keep it
            # without consuming any pinyin.
            text += " "
            # print "C2\t_\t(none)"
        else:
            # Punctuation etc.: copy one pinyin character through, then
            # swallow a following separator space if present.
            # print "C3\t", h, "\t", pinyin[0]
            text += pinyin[0]
            pinyin = pinyin[1:]
            if " " == pinyin[0]:
                pinyin = pinyin[1:]
    return text
def pinyin(text):
    """Shortcut: transcribe to Pinyin regardless of the configured type."""
    return transcribe(text, transcription="Pinyin")
def get_mean_word(text):
    """Return the colorized measure words (classifiers) for an expression,
    or "" when there are none."""
    if "" == text:
        return ""
    classifiers = db.get_classifiers(text)
    if not classifiers:
        return ""
    return local_dict_colorize(", ".join(classifiers))
def get_alternate_spellings(text):
    """Return the colorized alternate spellings of an expression,
    or "" when there are none."""
    if "" == text:
        return ""
    alternates = db.get_alt_spellings(text)
    if not alternates:
        return ""
    return local_dict_colorize(", ".join(alternates))
def sound(text, source=None):
    '''
    Returns sound tag for a given Hanzi string.
    If the sound does not already exist in the media directory, then
    attempt to obtain it from the specified source.
    if the specified source is omitted, use the one selected in the
    tools menu.
    If it fails (eg: no network connexion while trying to retrieve
    speech from Google TTS), return empty string.
    Does not work with pinyin or other transcriptions.
    '''
    text = cleanup(text)
    if None==source:
        source = chinese_support_config.options['speech']
    # Reduce the field to bare hanzi: no colors, tones, sound tags, HTML.
    text = no_color(no_accents(no_sound(text)))
    text = re.sub("<.*?>", "", text)
    if has_ruby(text):
        # Keep only the hanzi; the annotation is not TTS input.
        text = hanzi(text)
    if "" == text:
        return ""
    if "Google TTS Mandarin" == source:
        try:
            return "[sound:"+google_tts.get_word_from_google(text)+"]"
        except:
            # Best-effort: network or TTS failure yields no tag.
            return ""
    elif "Baidu Translate" == source:
        try:
            return "[sound:"+baidu_tts.get_word_from_baidu(text)+"]"
        except:
            return ""
    else:
        return ""
def check_for_sound(text):
    '''
    Returns True if the sound file already exists in the user's resources
    directory. Only Google TTS resources are checked.
    '''
    # Same hanzi-extraction pipeline as sound() above.
    text = cleanup(text)
    text = no_color(no_accents(no_sound(text)))
    text = re.sub("<.*?>", "", text)
    if has_ruby(text):
        text = hanzi(text)
    if "" == text:
        return False
    if google_tts.check_resources(text):
        return True
    return False
def get_any(fields, dico):
    u'''Get the 1st valid field from a list.
    Scans the names in `fields`, in order, and returns the value of the
    first one that exists in `dico` (case-insensitive).
    Returns "" when none exists.
    '''
    for wanted in fields:
        for key, value in dico.iteritems():
            try:
                if unicode(wanted.lower()) == unicode(key.lower()):
                    return value
            except:
                # Non-string names are simply skipped.
                pass
    return ""
def set_all(fields, dico, to):
    u'''Set every field of `dico` named in `fields` to the value `to`.
    (Non-existing fields are ignored.)
    Case-insensitive.
    '''
    for wanted in fields:
        for key, _value in dico.iteritems():
            try:
                if unicode(key.lower()) == unicode(wanted.lower()):
                    dico[key] = to
            except:
                # Non-string names are simply skipped.
                pass
def has_field(fields, dico):
    u'''
    Return True when any of the given field names exists in `dico`.
    Case-insensitive.
    '''
    for key, _value in dico.iteritems():
        for wanted in fields:
            try:
                if unicode(wanted.lower()) == unicode(key.lower()):
                    return True
            except:
                # Non-string names are simply skipped.
                pass
    return False
def no_sound(text):
    u'''
    Strip the [sound:xxx.mp3] tags that Anki adds when recording audio
    into a field, so copying a field's content does not duplicate the
    sound (which would then play twice).
    '''
    sound_tag = r'\[sound:.*?]'
    return re.sub(sound_tag, '', text)
#!Changes! (I've added the ability to separate Jyutping as well) | |
def separate_pinyin(text, force=False, cantonese=False):
    u"""
    Separate pinyin (or jyutping) syllables with whitespace.
    Eg: "Yīlù píng'ān" becomes "Yī lù píng ān"
    Does nothing unless the configured transcription is Pinyin,
    Pinyin (Taiwan) or Cantonese, or force is set.
    When force is truthy, `cantonese` selects jyutping (True) or pinyin
    (False) separation regardless of the configured transcription.
    Useful for people pasting Pinyin from Google Translate.
    """
    def _clean(t):
        # The syllable-pair regexes may capture a leading apostrophe
        # (standalone finals); drop it.
        if "'" == t[0]:
            return t[1:]
        return t
    def _separate_sub(p):
        return _clean(p.group("one"))+" "+_clean(p.group("two"))
    if (chinese_support_config.options['transcription'] \
            in ['Pinyin', 'Pinyin (Taiwan)'] and not force) or (force and not cantonese):
        # BUGFIX: run the substitution twice, as the jyutping branch
        # does. Each match consumes two syllables, so an odd-length run
        # like "ni3hao3ma5" needs a second pass to be fully separated;
        # the second pass had been commented out.
        text = pinyin_two_re.sub(_separate_sub, text)
        text = pinyin_two_re.sub(_separate_sub, text)
        return text
    elif (chinese_support_config.options['transcription'] \
            in ['Cantonese'] and not force) or (force and cantonese):
        text = jyutping_two_re.sub(_separate_sub, text)
        text = jyutping_two_re.sub(_separate_sub, text)
        return text
    else:
        return text
#!/Changes! | |
def simplify(text):
    u'''Convert hanzi to their simplified variants.'''
    return db.get_simplified(text)
def traditional(text):
    u'''Convert hanzi to their traditional variants.'''
    return db.get_traditional(text)
# Extra support functions and parameters
##################################################################
# Lazily-instantiated MSTranslator client; created on first use in translate().
MS_translator_object = None
# Maps each tone-accented vowel to its tone number (1-4).
# NOTE(review): some entries look like duplicates; they are presumably
# distinct Unicode normalizations (precomposed vs combining marks, and
# forms built on latin alpha 'ɑ') — confirm against the file's bytes.
vowel_tone_dict = {
    u'ā':1, u'ā':1, u'ɑ̄':1, u'ē':1, u'ī':1, u'ō':1, u'ū':1,
    u'ǖ':1, u'Ā':1, u'Ē':1, u'Ī':1, u'Ō':1, u'Ū':1, u'Ǖ':1,
    u'á':2, u'ɑ́':2, u'é':2, u'í':2, u'ó':2, u'ú':2, u'ǘ':2,
    u'Á':2, u'É':2, u'Í':2, u'Ó':2, u'Ú':2, u'Ǘ':2,
    u'ǎ':3, u'ɑ̌':3, u'ě':3, u'ǐ':3, u'ǒ':3, u'ǔ':3, u'ǚ':3,
    u'Ǎ':3, u'Ě':3, u'Ǐ':3, u'Ǒ':3, u'Ǔ':3, u'Ǚ':3,
    u'à':4, u'ɑ̀':4, u'è':4, u'ì':4, u'ò':4, u'ù':4, u'ǜ':4,
    u'À':4, u'È':4, u'Ì':4, u'Ò':4, u'Ù':4, u'Ǜ':4
    }
# Indexed by tone number (1-5): maps a bare vowel to its decorated form.
# Index 0 is unused; index 5 (neutral tone) leaves vowels unaccented,
# except 'v' which becomes 'ü'.
vowel_decorations = [
    { },
    { u'a':u'ā', u'e':u'ē', u'i':u'ī', u'o':u'ō', u'u':u'ū', u'ü':u'ǖ', u'v':u'ǖ'},
    { u'a':u'á', u'e':u'é', u'i':u'í', u'o':u'ó', u'u':u'ú', u'ü':u'ǘ', u'v':u'ǘ'},
    { u'a':u'ǎ', u'e':u'ě', u'i':u'ǐ', u'o':u'ǒ', u'u':u'ǔ', u'ü':u'ǚ', u'v':u'ǚ'},
    { u'a':u'à', u'e':u'è', u'i':u'ì', u'o':u'ò', u'u':u'ù', u'ü':u'ǜ', u'v':u'ǜ'},
    { u'a':u'a', u'e':u'e', u'i':u'i', u'o':u'o', u'u':u'u', u'ü':u'ü', u'v':u'ü'},
    ]
# Maps an accented vowel back to its bare letter (used by no_accents()).
base_letters = {
    u'ā':u'a', u'ē':u'e', u'ī':u'i', u'ō':u'o', u'ū':u'u', u'ǖ':u'ü',
    u'á':u'a', u'é':u'e', u'í':u'i', u'ó':u'o', u'ú':u'u', u'ǘ':u'ü',
    u'ǎ':u'a', u'ě':u'e', u'ǐ':u'i', u'ǒ':u'o', u'ǔ':u'u', u'ǚ':u'ü',
    u'à':u'a', u'è':u'e', u'ì':u'i', u'ò':u'o', u'ù':u'u', u'ǜ':u'ü',
    u'a':u'a', u'e':u'e', u'i':u'i', u'o':u'o', u'u':u'u', u'ü':u'ü',
    }
# Every accented vowel character, for use inside regex character classes.
accents = u'ɑ̄āĀáɑ́ǎɑ̌ÁǍàɑ̀ÀēĒéÉěĚèÈīĪíÍǐǏìÌōŌóÓǒǑòÒūŪúÚǔǓùÙǖǕǘǗǚǙǜǛ'
def pinyin_re_sub():
    """Build a regex source string matching one pinyin syllable:
    an initial + final (accented or not, optional tone digit 1-5),
    or an apostrophe-prefixed standalone final."""
    inits = u"zh|sh|ch|[bpmfdtnlgkhjqxrzscwy]"
    finals = u"i[ōóǒòo]ng|[ūúǔùu]ng|[āáǎàa]ng|[ēéěèe]ng|i[āɑ̄áɑ́ɑ́ǎɑ̌àɑ̀aāáǎàa]ng|[īíǐìi]ng|i[āáǎàa]n|u[āáǎàa]n|[ōóǒòo]ng|[ēéěèe]r|i[āáǎàa]|i[ēéěèe]|i[āáǎàa]o|i[ūúǔùu]|[īíǐìi]n|u[āáǎàa]|u[ōóǒòo]|u[āáǎàa]i|u[īíǐìi]|[ūúǔùu]n|u[ēéěèe]|ü[ēéěèe]|v[ēéěèe]|i[ōóǒòo]|[āáǎàa]i|[ēéěèe]i|[āáǎàa]o|[ōóǒòo]u|[āáǎàa]n|[ēéěèe]n|[āáǎàa]|[ēéěèe]|[ōóǒòo]|[īíǐìi]|[ūúǔùu]|[ǖǘǚǜüv]"
    standalones = u"'[āáǎàa]ng|'[ēéěèe]ng|'[ēéěèe]r|'[āáǎàa]i|'[ēéěèe]i|'[āáǎàa]o|'[ōóǒòo]u|'[āáǎàa]n|'[ēéěèe]n|'[āáǎàa]|'[ēéěèe]|'[ōóǒòo]"
    return "(("+inits+")("+finals+")[1-5]?|("+standalones+")[1-5]?)"
pinyin_re = pinyin_re_sub()
# Two consecutive syllables; used by separate_pinyin() to insert spaces.
pinyin_two_re = re.compile("(?P<one>"+pinyin_re+")(?P<two>"+pinyin_re+")", flags=re.I)
#!Changes! (These are the jyutping separating functions)
def jyutping_re_sub():
    """Build a regex source string matching one jyutping syllable:
    an initial + final (optional tone digit 1-6), or an
    apostrophe-prefixed standalone final."""
    inits = u"ng|gw|kw|[bpmfdtnlgkhwzcsj]"
    finals = u"i|ip|it|ik|im|in|ing|iu|yu|yut|yun|u|up|ut|uk|um|un|ung|ui|e|ep|et|ek|em|en|eng|ei|eu|eot|eon|eoi|oe|oet|oek|oeng|oei|o|ot|ok|om|on|ong|oi|ou|ap|at|ak|am|an|ang|ai|au|aa|aap|aat|aak|aam|aan|aang|aai|aau|m|ng"
    standalones = u"'uk|'ung|'e|'ei|'oe|'o|'ok|'om|'on|'ong|'oi|'ou|'ap|'at|'ak|'am|'an|'ang|'ai|'au|'aa|'aap|'aat|'aak|'aam|'aan|'aang|'aai|'aau|'m|'ng"
    return "(("+inits+")("+finals+")[1-6]?|("+standalones+")[1-6]?)"
jyutping_re = jyutping_re_sub()
# Two consecutive syllables; used by separate_pinyin() in Cantonese mode.
jyutping_two_re = re.compile("(?P<one>"+jyutping_re+")(?P<two>"+jyutping_re+")", flags=re.I)
#!/Changes!
# Shared local dictionary database, used by all lookup helpers above.
db = dictdb.DictDB()
# Maps a bopomofo tone mark to its tone number (first tone is unmarked).
bopomofo_notes = {
    u"ˊ":"2", u"ˇ":"3",u"ˋ":"4", u"˙":"5"}
def extract_sound_tags(text):
    """Split text into (text_without_sound_tags, sound_tags).
    sound_tags is the concatenation of every [sound:...] tag found,
    or "" when there is none.
    """
    # "".join handles the empty list directly; the original needed a
    # special case plus reduce(), a builtin removed in Python 3.
    sound_tags = "".join(re.findall(r"\[sound:.*?\]", text))
    nosound = re.sub(r"\[sound:.*?\]", r"", text)
    return nosound, sound_tags
def get_tone_number(pinyin):
    """Return the tone of a single syllable as a string digit.
    Accepts numbered pinyin (ni3), superscript marks, accented pinyin
    (nǐ) and bopomofo; defaults to "5" (neutral) when no tone is found.
    """
    if re.match(r".+1[0-9]$", pinyin):
        # Two-digit code "1x": returned whole. NOTE(review): presumably
        # matches the 'tone1?[0-9]' class names used by colorize/no_color.
        return pinyin[-2:]
    elif re.match(r".+[0-9]$", pinyin):
        # Plain numbered pinyin, eg "ni3".
        return pinyin[-1:]
    elif re.match(u".+[¹²³⁴]$", pinyin):
        # Superscript marks: position in the lookup string is the tone.
        return str(u" ¹²³⁴".index(pinyin[-1:]))
    elif re.match(u"[\u3100-\u312F]", pinyin):#Bopomofo
        if re.match(u"[ˊˇˋ˙]", pinyin[-1:]):
            return str(u" ˊˇˋ˙".index(pinyin[-1:]))
        else:
            # Bopomofo leaves the first tone unmarked.
            return "1"
    else:
        # Accented pinyin: first vowel carrying a tone mark wins.
        for c in pinyin:
            try:
                return str(vowel_tone_dict[c])
            except KeyError:
                continue
    return "5"
def has_ruby(text):
    """Return a truthy match object if text contains hanzi[...] ruby notation."""
    return re.search(u"[\u3400-\u9fff]\[.+\]", text)
def has_hanzi(text):
    """Return a truthy match object if text contains at least one hanzi."""
    return re.search(u"[\u3400-\u9fff]", text)
def get_character_transcription(hanzi, transcription=None, only_one=False):
    """Look up the transcription of a single character.
    transcription: "Pinyin", "Pinyin (Taiwan)", "Cantonese" or
    "Bopomofo"; defaults to the type selected in the menu.
    only_one: keep a single pronunciation for 多音字 (currently only
    honored by the Cantonese lookup).
    Returns "" for unknown transcription types.
    """
    # BUGFIX: ruby() calls this with three positional arguments
    # (hanzi, transcription, only_one); the original two-parameter
    # signature raised a TypeError.
    if transcription == None:
        transcription = chinese_support_config.options['transcription']
    if "Pinyin" == transcription:
        text = db.get_pinyin(hanzi)
    elif "Pinyin (Taiwan)" == transcription:
        text = db.get_pinyin(hanzi, taiwan=True)
    elif "Cantonese" == transcription:
        # NOTE(review): db.get_cantonese's own default for only_one is
        # assumed to match False — confirm in dictdb.
        text = db.get_cantonese(hanzi, only_one)
    elif "Bopomofo" == transcription:
        text = db.get_pinyin(hanzi, taiwan=True)
        text = bopomofo_module.bopomofo(no_accents(text))
    else:
        text = ""
    return text
def add_diaeresis(text):
    u'''Turn the "v" placeholder into "ü" (eg: "lv" -> "lü").'''
    try:
        return text.replace(u"v", u"ü")
    except:
        # Not a string (eg None): degrade to an empty string.
        return ""
def local_dict_colorize(txt, ruby=True):
    """
    Colorize text in the form :
    "Hello is written 你好[ni3 hao]"
    (as used in the local dictionaries)
    """
    def _sub(p):
        c = ""
        hanzi = p.group(1)
        pinyin = p.group(2)
        pinyin = accentuate_pinyin(pinyin)
        if ruby:
            if 1 == hanzi.count("|"):
                # "traditional|simplified" variants sharing one pinyin:
                # colorize each side, keeping the separator.
                hanzi = hanzi.split("|")
                c += colorize_fuse(hanzi[0], pinyin, True)
                c += "|"
                c += colorize_fuse(hanzi[1], pinyin, True)
            else:
                c += colorize_fuse(hanzi, pinyin, True)
        else:
            if 1 == hanzi.count("|"):
                #Hanzi has 2 variants (traditional and simplified)
                hanzi = hanzi.split("|")
                c += colorize_fuse(hanzi[0], pinyin, False)
                c += "|"
                c += colorize_fuse(hanzi[1], pinyin, False)
            else:
                c += colorize_fuse(hanzi, pinyin, False)
            # Without ruby, keep the bracketed (colorized) pinyin visible.
            c += "[" + colorize(pinyin) + "]"
        return c
    txt = re.sub(u"([\u3400-\u9fff|]+)\\[(.*?)\\]", _sub, txt)
    return txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment