chinese plugin mods
# -*- coding: utf-8 -*-
#
# Copyright © 2014 Thomas TEMPÉ, <thomas.tempe@alysse.org>
#
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
#
#COPYRIGHT AND PERMISSION NOTICE
#Copyright © 1991-2012 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
#Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode data files and any associated documentation (the "Data Files") or Unicode software and any associated documentation (the "Software") to deal in the Data Files or Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies of the Data Files or Software, and to permit persons to whom the Data Files or Software are furnished to do so, provided that (a) the above copyright notice(s) and this permission notice appear with all copies of the Data Files or Software, (b) both the above copyright notice(s) and this permission notice appear in associated documentation, and (c) there is clear notice in each modified Data File or in the Software as well as in the documentation associated with the Data File(s) or Software that the data or software has been modified.
#THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE.
#Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in these Data Files or Software without prior written authorization of the copyright holder.
"""Interface to the db/chinese_dict.sql SQLite database containing the local dictionaries
Available dictionaries:
* Chinese characters (Unihan)
* Chinese words (CEDICT), including:
* simplified and traditional spellings
* pinyin and Taiwan variant pronunciations
* English, German and French translations
hanzi table structure (Unihan data):
["cp", "kMandarin", "kCantonese", "kFrequency", "kHangul", "kJapaneseKun", "kSimplifiedVariant", "kTraditionalVariant", "Vietnamese"]
cidian table structure:
["traditional", "simplified", "pinyin", "pinyin_taiwan", "classifiers", "alternates", "english", "german", "french", "spanish"]
"""
import sqlite3
import os.path
class DictDB:
conn = None
c = None
def __init__(self):
try:
from aqt import mw
db_file = os.path.join(mw.pm.addonFolder(), "chinese", "db", "chinese_dict.sqlite")
except: #Used for local debugging
db_file = "db/chinese_dict.sqlite"
self.conn=sqlite3.connect(db_file)
self.c = self.conn.cursor()
#Create the DB indexes.
#Only works the first time.
#These indexes are removed from the distribution files, in order to save space
try:
self.c.execute("create index isimplified on cidian ( simplified );")
self.c.execute("create unique index itraditional on cidian ( traditional, pinyin );")
self.conn.commit()
except:
pass
def _get_char_pinyin(self, c):
"""returns the pinyin transcription of a given Hanzi from Unihan.
If it's not in the dictionary, return the original text.
If there are multiple possibilities, returns one at random.
"""
self.c.execute("select kMandarin from hanzi where cp = ?;", (c,) )
try:
(pinyin,) = self.c.fetchone()
return pinyin
except:
return None
#!Changes! (There was a problem with taiwan_pinyin being converted one character at a time instead of by each word group.
# Originally, this code checked if (len(taiwan_pinyin)>0). However, that raised an exception whenever a word was not found in the dictionary,
# because None would be returned rather than an empty string. I've changed it to check for None instead. Only if the pinyin (not the taiwan_pinyin)
# is not in the dictionary should an exception occur, because self.c.fetchone() returns None in that case. I am not sure whether the database was
# intended to return None, but the following code allows taiwan_pinyin to be checked word-by-word as well.)
def _get_word_pinyin(self, w, taiwan=False):
"""Returns the pinyin transcription of a word, from CEDICT.
If it's not in the dictionary, returns None.
If there are multiple possibilities, returns one at random.
if taiwan==True then prefer Taiwan variant
"""
self.c.execute("select pinyin, pinyin_taiwan from cidian where traditional=? or simplified=?;", (w, w))
try:
pinyin, taiwan_pinyin = self.c.fetchone()
#self.c.fetchone() gives a None value for taiwan_pinyin if pinyin_taiwan does not exist.
#If the word itself is not in the dictionary, self.c.fetchone() returns None and unpacking it into a tuple raises an exception.
if taiwan and taiwan_pinyin is not None:
return taiwan_pinyin
else:
return pinyin
except:
#Not in dictionary
return None
#/!Changes!
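# Illustrative example (added here for clarity, not part of the original add-on): _get_word_pinyin(u"垃圾", taiwan=True)
# would return the Taiwan reading (le4 se4) when CEDICT has a pinyin_taiwan entry for that word, fall back to the
# mainland reading (la1 ji1) when that column is None, and return None for a word that is not in cidian at all.
# The exact readings depend on the bundled CEDICT snapshot.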
#!Changes! (I've increased the word length that is looked up in the dictionary, in order to recognise chengyu (4-character idioms), 3-character words, etc.
#However, the word length can also be set manually via the wl parameter if 4 is not enough.)
def get_pinyin(self, w, taiwan=False, wl=4):
"""Returns the full pinyin transcription of a string.
Use CEDICT wherever possible. Use Unihan to fill in.
if taiwan==True then prefer Taiwan variant
"""
p = self._get_word_pinyin(w, taiwan)
if p:
return p #one word, in dictionary
if len(w)==1:
return self._get_char_pinyin(w) #single character
#We're looking up a string that's not in the dictionary
#We'll try each 4-character sequence in turn, then 3-character, then 2-character, and if those fail, fall back to single-character lookup.
#transcription = u""
transcription = u""
w = w[:]
last_was_pinyin = False
while len(w)>0:
word_was_found = False
word_len = wl
while word_len > 1:
p = self._get_word_pinyin(w[:word_len], taiwan)
if p:
transcription = add_with_space(transcription, p)
w = w[word_len:]
last_was_pinyin = True
word_was_found = True
break
word_len -= 1
if word_was_found == False:
p = self._get_char_pinyin(w[0])
if p:
transcription = add_with_space(transcription, p)
last_was_pinyin = True
else:
#add character directly.
#Pad with spaces appropriately
if last_was_pinyin:
transcription+=" "
transcription+=w[0]
last_was_pinyin = False
w = w[1:]
return transcription
#!/Changes!#
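# Illustrative walk-through (added here for clarity, not part of the original add-on) of the longest-match lookup above:
# get_pinyin(u"一路平安吗", wl=4) first finds the 4-character entry 一路平安 in cidian in a single lookup, instead of
# transcribing it character by character, and then falls back to the single-character Unihan reading for 吗.
# The exact output depends on the bundled CEDICT/Unihan data.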
def get_cantonese(self, w, only_one=True):
"""Returns a character-by-character cantonese transcription."""
t = u""
for c in w:
self.c.execute("select kCantonese from hanzi where cp = ?;", (c,) )
try:
(k,) = self.c.fetchone()
if only_one:
k = k.split(" ")[0]
else:
k = k.replace(" ", "|")
t = add_with_space(t, k)
except:
t+=c
return t
#!Changes! (I've taken the same technique used for converting pinyin and applied it to converting simplified to traditional and vice versa.
# Originally, the characters were being converted one-for-one, which caused many incorrect conversions, e.g. 面 and 麵. Also, conversion now
# uses CEDICT instead of Unihan, except for single characters.)
def _get_char_traditional(self, c):
"""Uses Unihan to find a traditional variant"""
self.c.execute("select kTraditionalVariant from hanzi where cp = ?;", (c,) )
try:
(k,) = self.c.fetchone()
return k
except:
return None
def _get_word_traditional(self, w):
"""Uses CEDICT to find a traditional variant"""
self.c.execute("select traditional from cidian where traditional=? or simplified=?;", (w, w) )
try:
(k,) = self.c.fetchone()
return k
except:
return None
def get_traditional(self, w, wl=4):
"""Returns the full traditional form of a string.
Use CEDICT wherever possible. Use Unihan to fill in.
"""
p = self._get_word_traditional(w)
if p:
return p #one word, in dictionary
if len(w)==1:
return self._get_char_traditional(w) #single character
#We're looking up a string that's not in the dictionary
#We'll try each 4-character sequence in turn, then 3-character, then 2-character, and if those fail, fall back to single-character lookup.
traditional = u""
w = w[:]
while len(w)>0:
word_was_found = False
word_len = wl
while word_len > 1:
p = self._get_word_traditional(w[:word_len])
if p:
traditional += p
w = w[word_len:]
word_was_found = True
break
word_len -= 1
if word_was_found == False:
p = self._get_char_traditional(w[0])
if p:
traditional += p
else:
#add character directly.
traditional+=w[0]
w = w[1:]
return traditional
def _get_char_simplified(self, c):
"""Uses Unihan to find a simplified variant"""
self.c.execute("select kSimplifiedVariant from hanzi where cp = ?;", (c,) )
try:
(k,) = self.c.fetchone()
return k
except:
return None
def _get_word_simplified(self, w):
"""Uses CEDICT to find a traditional variant"""
self.c.execute("select simplified from cidian where traditional=? or simplified=?;", (w, w) )
try:
(k,) = self.c.fetchone()
return k
except:
return None
def get_simplified(self, w, wl=4):
"""Returns the full traditional form of a string.
Use CEDICT wherever possible. Use Unihan to fill in.
"""
p = self._get_word_simplified(w)
if p:
return p #one word, in dictionary
if len(w)==1:
return self._get_char_simplified(w) #single character
#We're looking up a string that's not in the dictionary
#We'll try each 4-character sequence in turn, then 3-character, then 2-character, and if those fail, fall back to single-character lookup.
simplified = u""
w = w[:]
while len(w)>0:
word_was_found = False
word_len = wl
while word_len > 1:
p = self._get_word_simplified(w[:word_len])
if p:
simplified += p
w = w[word_len:]
word_was_found = True
break
word_len -= 1
if word_was_found == False:
p = self._get_char_simplified(w[0])
if p:
simplified += p
else:
#add character directly.
simplified+=w[0]
w = w[1:]
return simplified
#!/Changes!
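# Illustrative example (added here for clarity, not part of the original add-on) of the word-level conversion above:
# get_traditional(u"面条") looks up the whole word in cidian and returns 麵條, whereas a character-by-character Unihan
# lookup could not tell whether 面 should stay 面 (face) or become 麵 (noodles). Output depends on the bundled CEDICT data.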
def get_definitions(self, w, lang):
'''Returns all definitions for a given language.
lang should be one of "en", "de", "fr", "es".
Returns a list of tuples, each one in the format
(pinyin, definition, classifiers, alternates).
'''
langs = {"en":"english", "de":"german", "fr":"french", "es":"spanish"}
self.c.execute("select distinct pinyin, %s as definition, classifiers, alternates from cidian where (traditional=? or simplified=?) and length(definition)>0 order by pinyin;" % langs[lang], (w, w))
try:
return self.c.fetchall()
except:
return []
def get_classifiers(self, txt):
r = []
self.c.execute("select distinct classifiers from cidian where (traditional=? or simplified=?);", (txt, txt))
try:
#fetchall returns a list of tuples, converts to a list of strings
return filter(lambda a:a, map(lambda a:a[0], self.c.fetchall()))
except:
return []
def get_alt_spellings(self, txt):
self.c.execute("select distinct alternates from cidian where (traditional=? or simplified=?);", (txt, txt))
try:
#fetchall returns a list of tuples, converts to a list of strings
return filter(lambda a:a, map(lambda a:a[0], self.c.fetchall()))
except:
return []
def add_with_space(a, b):
if len(a)>0 and " " != a[-1]:
return a+" "+b
return a+b
# -*- coding: utf-8 -*-
# Welcome to the Chinese Support Add-on's field edition ruleset.
# Here, you can tweak the note editor helper's behavior to your liking.
#
# If you messed things up, you can safely delete file
# addons/chinese/edit_behavior.py from your Anki directory.
# It will be recreated the next time you restart Anki.
#
# You can read about all available functions at:
# https://github.com/ttempe/chinese-support-addon/wiki/Edit-behavior
# Also, see the Python tutorial at http://docs.python.org/2/tutorial
""" Note that I've changed almost everything in this file to my liking. I have cleaned up the code into individual functions.
I couldn't figure out what the hell the 1st Case was doing. But I never used the given models anyway.
Basically, I've changed the fields to allow pinyin, pinyin (taiwan), cantonese and bopomofo to all be able to be
used on the same deck. Additionally, English, German and French translations can be given on the same deck.
The problem with having a "setting" change the transcription is that people who are studying both Cantonese and Mandarin,
or who want to use both Pinyin and Bopomofo keep having to change the settings. This way, the setting is automatic per deck.
However, I have left the option to use the setting chosen if so desired. A few other things have been changed as well.
Finally, I have also added "Simplified/TraditionalInclusive" fields where the the simplified/traditional is given even if it
is the same as the hanzi field. (Useful if you are mixing simplified and traditional forms for different cards in the deck,
but still want to display both the simplified and traditional without weird spacing issues when either the simplified or
traditional forms are the same as given.
"""
from config import chinese_support_config
from edit_functions import *
from sentDict import sentDict
anki1_model_names = ["Chinese", "chinese", "Mandarin Vocab"] #"Mandarin" is used by the Pinyin Toolkit port by Chris Hatch
Hanzi_fields = ["Expression", "Hanzi", "Chinese", u"汉字", u"漢字", u"中文"]
Color_fields = ["Color", "Colour", "Colored Hanzi", u"彩色"]
ColorPY_fields = ["ColorPY", "ColourPY"]
ColorPYTW_fields = ["ColorPYTW", "ColourPYTW"]
ColorCANT_fields = ["ColorCANT", "ColourCANT"]
ColorBPMF_fields = ["ColorBPMF", "ColourBPMF"]
Transcription_fields = ["Reading"]
Pinyin_fields = ["Pinyin", "PY", u"拼音", u"大陆拼音", u"大陸拼音"]
PinyinTW_fields = ["PinyinTW", "PYTW", u"臺灣拼音", u"台灣拼音", u"台湾拼音"]
Cantonese_fields = ["Cantonese", "Jyutping", u"廣東話", u"广东话", u"粵語", u"粤语",u"廣州話", u"广州话", u"粵", u"粤", u"粵拼", u"粤拼"]
Bopomofo_fields = [u"注音符號", u"註音符號", u"注音符号", "Bopomofo", u"ㄅㄆㄇㄈ"]
Meaning_fields = ["Meaning", "Definition", u"意思", u"翻译", u"翻譯", u"解释", u"解釋"]
English_fields = ["English", u"英语", u"英語", u"英文"]
German_fields = ["German", "Deutsch", u"德语", u"德語", u"德文"]
French_fields = ["French", "le français", u"法语", u"法語", u"法文"]
Sound_fields = ["Audio", "Sound", "Spoken", u"声音", u"聲音"]
#Excludes simplified/traditional pairs which are the same
Simplified_fields = ["Simplified", "Simp", "Simp.", u"简体", u"簡體", u"简化", u"簡化", u"简体字", u"簡體字", u"简化字", u"簡化字"]
Traditional_fields = ["Traditional", "Trad", "Trad.", u"繁体", u"繁體", u"繁体字", u"繁體字"]
#Includes simplified/traditional pairs which are the same
SimplifiedInclusive_fields = ["Simplified Inclusive"]
TraditionalInclusive_fields = ["Traditional Inclusive"]
Mean_Word_fields = ["Mean Word", "Measure Word", "MW", "Mean", "Classifier", u"量词", u"量詞"]
Alternate_fields = ["Also writted", "Alt", "Alternate"]
Ruby_fields = ["Ruby"]
RubyPY_fields = ["RubyPY"]
RubyPYTW_fields = ["RubyPYTW"]
RubyCANT_fields = ["RubyCANT"]
RubyBPMF_fields = ["RubyBPMF"]
Silhouette_fields = ["Silhouette"]
Sentence_fields = ["Usage", "Sentences"]
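# Illustrative sketch (added here for clarity, not part of the original add-on): with the field lists above, a model
# containing the fields "Hanzi", "Pinyin", "Jyutping" and "Bopomofo" gets each transcription filled from its own field
# when the Hanzi field is edited, roughly as if the editor hook called:
#   fields = {"Hanzi": u"你好", "Pinyin": u"", "Jyutping": u"", "Bopomofo": u""}
#   update_fields(fields, "Hanzi", "Basic", "")
# The dict contents and model name above are assumptions for illustration only.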
def update_fields(field, updated_field, model_name, model_type):
#1st case : the new Ruby-based model
if model_type == "Chinese Ruby":
if updated_field == "Hanzi":
#Update the ruby
h = colorize(ruby(accentuate_pinyin(field["Hanzi"])))
#Add the toneless transcription and hanzi, hidden,
#to make them searchable
h = hide_ruby(h)
field["Hanzi"] = h
if field["Hanzi"] == "":
field["Meaning"] = ""
elif field["Meaning"] == "":
field["Meaning"] = translate( field["Hanzi"] )
elif updated_field[0:5] == "Hanzi":#Field name starts with "Hanzi"
field[updated_field] = \
colorize( ruby( accentuate_pinyin( field[updated_field] ) ) )
#2nd case : use the old Anki1 Pinyin-toolkit rules if the deck is
#called "Chinese" or was created as "Chinese (compatibility)" from
#Anki2.
#Note that we accept multiple field names for each field, to ensure
#Anki1 compatibility.
else:
#Define Functions
def update_Meaning_fields():
#Update Meaning field only if empty.
m = ""
if get_any(Meaning_fields, field) == "" :
m = translate(field[updated_field])
#If there's no mean word field, then add it here
if not has_field(Mean_Word_fields, field):
mw = get_mean_word(field[updated_field])
if mw:
m += "<br>Cl: "+mw
#If there's no alt spelling field, then add it here
if not has_field(Alternate_fields, field):
mw = get_alternate_spellings(field[updated_field])
if mw:
m += "<br>Also written: "+mw
set_all(Meaning_fields, field, to = m)
#Translate to English
m = ""
if get_any(English_fields, field) == "" :
m = translate(field[updated_field], "zh", "local_en")
#If there's no mean word field, then add it here
if not has_field(Mean_Word_fields, field):
mw = get_mean_word(field[updated_field])
if mw:
m += "<br>Cl: "+mw
#If there's no alt spelling field, then add it here
if not has_field(Alternate_fields, field):
mw = get_alternate_spellings(field[updated_field])
if mw:
m += "<br>Also written: "+mw
set_all(English_fields, field, to = m)
#Translate to German
m = ""
if get_any(German_fields, field) == "" :
m = translate(field[updated_field], "zh", "local_de")
#If there's no mean word field, then add it here
if not has_field(Mean_Word_fields, field):
mw = get_mean_word(field[updated_field])
if mw:
m += "<br>Cl: "+mw
#If there's no alt spelling field, then add it here
if not has_field(Alternate_fields, field):
mw = get_alternate_spellings(field[updated_field])
if mw:
m += "<br>Also written: "+mw
set_all(German_fields, field, to = m)
#Translate to French
m = ""
if get_any(French_fields, field) == "" :
m = translate(field[updated_field], "zh", "local_fr")
#If there's no mean word field, then add it here
if not has_field(Mean_Word_fields, field):
mw = get_mean_word(field[updated_field])
if mw:
m += "<br>Cl: "+mw
#If there's no alt spelling field, then add it here
if not has_field(Alternate_fields, field):
mw = get_alternate_spellings(field[updated_field])
if mw:
m += "<br>Also written: "+mw
set_all(French_fields, field, to = m)
return
def update_Mean_Word_fields():
#Update Mean word field only if empty.
if get_any(Mean_Word_fields, field) == "" :
m = get_mean_word(field[updated_field])
set_all(Mean_Word_fields, field, to = m)
return
def update_Alternative_fields():
#Update alt spelling field only if empty.
if get_any(Alternate_fields, field) == "" :
m = get_alternate_spellings(field[updated_field])
set_all(Alternate_fields, field, to = m )
return
def update_Silhouette_fields():
m = silhouette(get_any(Hanzi_fields,field))
set_all(Silhouette_fields, field, to = m)
return
def update_all_Transcription_fields():
#Update transcription fields
#Only if it's empty
if get_any(Transcription_fields, field) == "" :
t = colorize( transcribe( no_sound( field[updated_field] ) ) )
#Hide the unaccented transcription in the field,
#to make searching easier
t = hide(t, no_tone(t))
set_all(Transcription_fields, field, to = t )
if get_any(Pinyin_fields, field) == "" :
t = colorize( transcribe( no_sound( field[updated_field] ), "Pinyin") )
t = hide(t, no_tone(t))
set_all(Pinyin_fields, field, to = t )
if get_any(PinyinTW_fields, field) == "" :
t = colorize( transcribe( no_sound( field[updated_field] ), "Pinyin (Taiwan)") )
t = hide(t, no_tone(t))
set_all(PinyinTW_fields, field, to = t )
if get_any(Cantonese_fields, field) == "" :
t = colorize( transcribe( no_sound( field[updated_field] ), "Cantonese", False ) )
t = hide(t, no_tone(t))
set_all(Cantonese_fields, field, to = t )
if get_any(Bopomofo_fields, field) == "" :
t = colorize( transcribe( no_sound( field[updated_field] ), "Bopomofo") )
t = hide(t, no_tone(t))
set_all(Bopomofo_fields, field, to = t )
return
def update_all_Color_fields():
#Update Color fields from the Hanzi field,
h = no_sound( get_any(Hanzi_fields,field))
#Take the tone info from the Transcription field
t = no_sound( no_color(get_any(Transcription_fields, field) ) )
c = colorize_fuse( h, t )
set_all(Color_fields, field, to = c )
#Take the tone info from the Pinyin field
t = no_sound( no_color(get_any(Pinyin_fields, field) ) )
c = colorize_fuse( h, t )
set_all(ColorPY_fields, field, to = c )
set_all(Color_fields, field, to = c )
#Take the tone info from the PinyinTW field
t = no_sound( no_color(get_any(PinyinTW_fields, field) ) )
c = colorize_fuse( h, t )
set_all(ColorPYTW_fields, field, to = c )
#Take the tone info from the Cantonese field
t = no_sound( no_color(get_any(Cantonese_fields, field) ) )
#z = no_sound( get_any(Traditional_fields,field))
if get_any(Traditional_fields, field) == "":
h = traditional(h)
c = colorize_fuse( h, t )
set_all(ColorCANT_fields, field, to = c )
#Take the tone info from the Bopomofo field
t = no_sound( no_color(get_any(Bopomofo_fields, field) ) )
c = colorize_fuse( h, t )
set_all(ColorBPMF_fields, field, to = c )
return
def update_Sound_fields():
#Update Sound field from Hanzi field if non-empty
#(only if field actually exists, as it implies downloading
#a soundfile from Internet)
if has_field(Sound_fields, field) and \
get_any(Sound_fields, field)=="":
set_all(Sound_fields, field, to = sound(field[updated_field]))
return
def update_all_Simplified_Traditional_fields():
#Update simplified/traditional fields
s = simplify(field[updated_field])
set_all(SimplifiedInclusive_fields, field, to = s )
if s <> field[updated_field]:
set_all(Simplified_fields, field, to = s )
else:
set_all(Simplified_fields, field, to = "" )
t = traditional(field[updated_field])
set_all(TraditionalInclusive_fields, field, to = t )
if t <> field[updated_field]:
set_all(Traditional_fields, field, to = t )
else:
set_all(Traditional_fields, field, to = "" )
return
def update_all_Ruby_fields():
#Update ruby fields
#m = colorize_fuse(get_any(Hanzi_fields, field), get_any(Transcription_fields, field), ruby=True)
#set_all(Ruby_fields, field, to = m)
m = colorize_fuse(get_any(Hanzi_fields, field), get_any(Pinyin_fields, field), ruby=True)
set_all(RubyPY_fields, field, to = m)
set_all(Ruby_fields, field, to = m)
m = colorize_fuse(get_any(Hanzi_fields, field), get_any(PinyinTW_fields, field), ruby=True)
set_all(RubyPYTW_fields, field, to = m)
m = colorize_fuse(get_any(Hanzi_fields, field), get_any(Bopomofo_fields, field), ruby=True)
set_all(RubyBPMF_fields, field, to = m)
m = colorize_fuse(get_any(Hanzi_fields, field), get_any(Cantonese_fields, field), ruby=True)
if get_any(Traditional_fields, field) != "":
m = traditional(m)
set_all(RubyCANT_fields, field, to = m)
return
def update_sentence_fields():
u = sentDict()
if get_any(Sentence_fields, field) == "": #and field[updated_field] != "":
u = u.senticize(field[updated_field],1000)
set_all(Sentence_fields, field, to = u)
def erase_fields():
#Erase other fields if the updated field was emptied
if field[updated_field]=="":
set_all(Meaning_fields, field, to="")
set_all(English_fields, field, to="")
set_all(German_fields, field, to="")
set_all(French_fields, field, to="")
set_all(Transcription_fields, field, to="")
set_all(Pinyin_fields, field, to="")
set_all(PinyinTW_fields, field, to="")
set_all(Cantonese_fields, field, to="")
set_all(Bopomofo_fields, field, to="")
set_all(Sound_fields, field, to="")
set_all(Simplified_fields, field, to="")
set_all(Traditional_fields, field, to="")
set_all(SimplifiedInclusive_fields, field, to="")
set_all(TraditionalInclusive_fields, field, to="")
set_all(Mean_Word_fields, field, to="")
set_all(Alternate_fields, field, to="")
set_all(Ruby_fields, field, to="")
set_all(RubyPY_fields, field, to="")
set_all(RubyPYTW_fields, field, to="")
set_all(RubyCANT_fields, field, to="")
set_all(RubyBPMF_fields, field, to="")
set_all(Silhouette_fields, field, to="")
set_all(Sentence_fields, field, to="")
return
#Fields to update after the Hanzi field has been modified:
if updated_field in Hanzi_fields:
erase_fields()
update_Meaning_fields()
update_Mean_Word_fields()
update_Alternative_fields()
update_Silhouette_fields()
update_all_Transcription_fields()
update_all_Color_fields()
update_Sound_fields()
update_all_Simplified_Traditional_fields()
update_all_Ruby_fields()
update_sentence_fields()
#If the transcription was modified, update the Color field
elif updated_field in Transcription_fields:
t = colorize(accentuate_pinyin(separate_pinyin(no_color(field[updated_field]))))
t = hide(t, no_tone(t))
field[updated_field] = t
update_all_Color_fields()
update_all_Ruby_fields()
elif updated_field in Pinyin_fields:
t = colorize(accentuate_pinyin(separate_pinyin(no_color(field[updated_field]), True), True))
t = hide(t, no_tone(t))
field[updated_field] = t
update_all_Color_fields()
update_all_Ruby_fields()
elif updated_field in PinyinTW_fields:
t = colorize(accentuate_pinyin(separate_pinyin(no_color(field[updated_field]), True), True))
t = hide(t, no_tone(t))
field[updated_field] = t
#Also update Bopomofo
set_all(Bopomofo_fields, field, to=pinyin_to_bopomofo(t))
update_all_Color_fields()
update_all_Ruby_fields()
elif updated_field in Cantonese_fields:
t = colorize(separate_pinyin(no_color(field[updated_field]), True, True))
t = hide(t, no_tone(t))
field[updated_field] = t
update_all_Color_fields()
update_all_Ruby_fields()
elif updated_field in Bopomofo_fields:
t = no_color(field[updated_field])
t = hide(t, no_tone(t))
field[updated_field] = t
update_all_Color_fields()
update_all_Ruby_fields()
#If the traditional/simplified inclusive fields were modified, update the others
elif updated_field in SimplifiedInclusive_fields:
s = field[updated_field]
if s == "" :
s = get_any(Hanzi_fields, field)
if s <> get_any(Hanzi_fields, field) :
set_all(Simplified_fields, field, to = s )
else:
set_all(Simplified_fields, field, to = "" )
elif updated_field in Simplified_fields:
s = field[updated_field]
if s == "" :
set_all(SimplifiedInclusive_fields, field, to = get_any(Hanzi_fields, field))
else:
set_all(SimplifiedInclusive_fields, field, to = s )
elif updated_field in TraditionalInclusive_fields:
t = field[updated_field]
if t == "" :
t = get_any(Hanzi_fields, field)
if t <> get_any(Hanzi_fields, field) :
set_all(Traditional_fields, field, to = t )
else:
set_all(Traditional_fields, field, to = "" )
elif updated_field in Traditional_fields:
t = field[updated_field]
if t == "" :
set_all(TraditionalInclusive_fields, field, to = get_any(Hanzi_fields, field))
else:
set_all(TraditionalInclusive_fields, field, to = t )
return field
# -*- coding: utf-8 -*-
#
# Copyright © 2012 Thomas TEMPÉ, <thomas.tempe@alysse.org>
#
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
from aqt import mw
import re
from config import chinese_support_config
import bopomofo as bopomofo_module
import google_tts
import baidu_tts
from microsofttranslator import Translator as MSTranslator
import dictdb
# Essential Edit functions
##################################################################
#
# You may call any of these functions from the edit_behavior.py file.
def colorize(text, ruby_whole=False):
u'''Add tone color info.
(can be seen in the card preview, but not the note edit view).
Works on transcription, hanzi or ruby.
In the case of ruby, it will colorize only the annotation by default.
If ruby_whole = True, then it will colorize the whole character.
Warning : it's not recommended to use this function on hanzi directly,
since it cannot choose the correct color in the case of
多音字 (characters with multiple pronunciations).'''
text = no_color(text)
(text, sound_tags) = extract_sound_tags(text)
def colorize_hanzi_sub(p):
return u'<span class="tone{t}">{r}</span>'.format(t=get_tone_number(transcribe(p.group(1), only_one=True)), r=p.group())
def colorize_pinyin_sub(p):
pinyin = p.group()
if pinyin[0] in '&<"/':
return pinyin
else:
return u'<span class="tone{t}">{r}</span>'.format(t=get_tone_number(p.group(1)), r=pinyin)
if has_ruby(text): #Treat like ruby
if ruby_whole:
def colorize_ruby_sub(p):
return u'<span class="tone{t}">{r}</span>'.format(t=get_tone_number(p.group(2)), r=p.group())
text = re.sub(u'([\u3400-\u9fff]\[\s*)([a-zü'+accents+u']+1?[0-9¹²³⁴]?)(.*?\])', colorize_ruby_sub, text, flags=re.I)
else:
text = re.sub(u'([a-zü'+accents+u']+1?[0-9¹²³⁴]?)', colorize_pinyin_sub, text, flags=re.I)
elif has_hanzi(text):
text = re.sub(u'([\u3400-\u9fff])', colorize_hanzi_sub, text)
else:
text = re.sub(u'([&<"/]?[a-zü'+accents+u']+1?[0-9¹²³⁴]?)', colorize_pinyin_sub, text, flags=re.I)
text = text+sound_tags
return text
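# Illustrative example (added here for clarity, not part of the original add-on): colorize(u"ni3") returns
# u'<span class="tone3">ni3</span>', and colorize(u"hǎo") wraps the syllable in a tone3 span as well,
# since get_tone_number() reads the tone from either the trailing digit or the accented vowel.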
def ruby_top(txt):
"Extract the top (pronunciation) part of a ruby string."
r = r' ?([^ >]+?)\[(.+?)\]'
return re.sub(r, r'\2 ', no_sound(txt))
def ruby_bottom(txt):
"Extract the bottom part of a ruby string."
r = r' ?([^ >]+?)\[(.+?)\]'
text = re.sub(r, r'\1 ', no_sound(txt))
return text
def no_color(text):
"Remove tone color info and other HTML pollutions"
if text == None:
return ""
text = text.replace(r'&nbsp;', '')
text = no_hidden(text)
#remove color info
text = re.sub(r'<span class="tone1?[0-9]">(.*?)</span>', r'\1', text)
#remove black font tag sometimes added by Anki
text = re.sub(r'<font color="#000000">(.*?)</font>', r'\1', text)
return text
def hide(text, hidden):
"""Add hidden keyword to string (typically Hanzi and toneless pinyin),
to make a note searchable in the 'browse' window
"""
if len(text) == 0 or text == "<br />":
return ""
hidden = no_color(hidden)
hidden = hidden.replace("<.*?>", "")
hidden = hidden.replace(r"[<!->]", "")
return text + "<!--"+hidden+"-->"
def hide_ruby(text):
"""Append hidden hanzi and toneless pinyin to a ruby string,
to make a note searchable in the 'browse' window.
"""
t = no_tone(ruby_top(text))
t += no_color(ruby_bottom(text)).replace(" ", "")
return hide(text, t)
def silhouette(hanzi):
"""Replaces each Chinese character by a blank space.
Eg: 以A为B -> _A_B
Eg: 哈密瓜 -> _ _ _
"""
def insert_spaces(p):
r = ""
for i in p.group(0):
r += i + " "
return r[:-1]
hanzi = re.sub(u"[\u3400-\u9fff]+", insert_spaces, hanzi)
txt = re.sub(u"[\u3400-\u9fff]", "_", hanzi)
return txt
def no_hidden(text):
"""Remove hidden keyword string"""
return re.sub(r"<!--.*?-->", "", text)
def accentuate_pinyin(text, force=False):
u'''Add accents to pinyin.
Eg: ni2 becomes ní.
Eg: ní4 becomes nì. (to make correction easier)
Does nothing if the default transcription is not Pinyin or Pinyin (Taiwan),
unless force=True.
Note: also removes coloring. If you want color, please add it last.
'''
def accentuate_pinyin_sub(p):
pinyin = p.group(1)
tone = p.group(2)
if "tone"==pinyin:
return pinyin+tone
# for v in accents:
# re.sub(v, base_letters[v], pinyin)
pinyin = no_tone(pinyin)
for v in u"aeiouüvAEIOUÜV":
if pinyin.find(v)>-1:
try:
return re.sub(v, vowel_decorations[int(tone)][v.lower()], pinyin, count=1)
except (KeyError, IndexError):
pass
return pinyin
if chinese_support_config.options['transcription'] \
not in ['Pinyin', 'Pinyin (Taiwan)'] and not force:
return text
text = no_color(text)
text = re.sub(u'([a-z]*[aeiouüÜv'+accents+u'][a-zü]*)([1-5])', accentuate_pinyin_sub, text, flags=re.I)
return text
#!Changes! (There was a problem with the 5th (neutral) tone in pinyin not being converted to a trailing 5. This caused problems with bopomofo colouring,
#because the bopomofo conversion could not insert the appropriate 5th-tone symbol (since no '5' was given), so the syllable appeared to the
#colouriser to be 1st tone (in bopomofo, 1st tone has no symbol, just as pinyin has no symbol for the 5th tone).
#Adding the unaccented vowels to the regular expression below removed this problem.)
def no_accents(text):
u'Eg: ní becomes ni2.'
def desaccentuate_pinyin_sub(p):
return ""+p.group(1)+base_letters[p.group(2).lower()]+p.group(3)+get_tone_number(p.group(2).lower())
#Remove +u'aeiouüvAEIOUÜV' if you want 5th tone to be ignored
return re.sub(u'([a-zü]*)(['+u'aeiouüvAEIOUÜV'+accents+u'])([a-zü]*)', desaccentuate_pinyin_sub, text, flags=re.I)
#!/Changes!
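# Illustrative example (added here for clarity, not part of the original add-on): with the unaccented vowels included,
# no_accents(u"nǐ hǎo ma") now yields u"ni3 hao3 ma5" rather than leaving "ma" without a tone number, so the bopomofo
# conversion can attach the neutral-tone mark instead of treating the syllable as 1st tone.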
def ruby(text, transcription=None, only_one=False, try_dict_first=True):
u'''Convert hanzi to ruby notation, eg: '你' becomes '你[nǐ]'.
This can in turn be used with the {{Ruby:fieldname}} card template,
to generate beautiful ruby-annotated cards.
If not specified, use the transcription type set in the menubar (eg pinyin).
if try_dict_first, looks up sequences of characters in the
selected words dictionary to supply a better transcription.
Characters not found in the words dictionary fall back to the
single-character dictionary, which may yield several possible readings.
'''
if transcription == None:
transcription = chinese_support_config.options['transcription']
#Replace Chinese typography with its ASCII counterpart
text = re.sub(u'[[【]', u'[', text)
text = re.sub(u'[]】]', u']', text)
#Strip former HTML tone marking and comments
text = no_color(text)
text = no_sound(text)
#Make sure sound tag isn't confused with Hanzi
text = re.sub(u'([\u3400-\u9fff])(\[sound:)', r'\1 \2', text)
def insert_multiple_pinyin_sub(p):
hanzi=p.group(1)
transc = db.get_pinyin(hanzi)
if not transc:
return p.group()
transc = transc.split(" ")
ret = ""
hanzi = p.group(1)
while len(hanzi):
if "Pinyin" == transcription:
ret += hanzi[0] + "["+transc.pop(0)+"]"
elif "Bopomofo" == transcription:
ret += hanzi[0] + "["
ret += bopomofo_module.bopomofo(no_accents(transc.pop(0)))+"]"
hanzi = hanzi[1:]
return ret+p.group(2)
def insert_pinyin_sub(p):
return p.group(1)+'['+get_character_transcription(p.group(1), transcription, only_one)+']'+p.group(2)
text += '%'
if try_dict_first and transcription in ["Pinyin", "Bopomofo"]:
text = re.sub(u'([\u3400-\u9fff]+)([^[])', insert_multiple_pinyin_sub, text)
text = re.sub(u'([\u3400-\u9fff])([^[])', insert_pinyin_sub, text)
text = re.sub(u'([\u3400-\u9fff])([^[])', insert_pinyin_sub, text)
text = text[:-1]
text += sound(text)
return text
def no_tone(text):
u'''Removes tone information and coloring.
Eg: 'ni3' becomes 'ni', 'má' becomes 'ma'
'''
text = no_color(text)
text = no_accents(text)
def no_tone_marks_sub(p):
return ""+p.group(1)+re.sub(r'1?[0-9¹²³⁴]', '', p.group(2))+"]"
if has_ruby(text):
text = re.sub(u'([\u3400-\u9fff]\[)([^[]+?)\]', no_tone_marks_sub, text)
else:
text = re.sub(u'([a-zü]+)1?[0-9¹²³⁴]', r'\1', text)
return text
def hanzi(text):
u'''Returns just the hanzi from a Ruby notation.
Eg: '你[nǐ][You]' becomes '你'.
'''
text = re.sub(u'([\u3400-\u9fff])(\[[^[]+?\])', r'\1', text)
text = re.sub(r'\[sound:.[^[]+?\]', '', text)
text = re.sub(r'([^\u3400-\u9fff])\[[^[]+?\]\s*$', r'\1', text)
return text
def transcribe(text, transcription=None, only_one=True):
u'''
Converts to specified transcription.
Eg : 你 becomes nǐ (transcription="Pinyin", only_one=True)
Pinyin, Taiwan Pinyin and Bopomofo: lookup in local words dictionaries
first, and use characters dictionary as a backup.
If no transcription is specified, use the transcription set in the menu.
'''
text = cleanup(text)
if text == "":
return ""
if None == transcription:
transcription = chinese_support_config.options["transcription"]
if "Pinyin" == transcription:
r = db.get_pinyin(text, taiwan=False)
elif "Pinyin (Taiwan)" == transcription:
r = db.get_pinyin(text, taiwan=True)
elif "Cantonese" == transcription:
r = db.get_cantonese(text, only_one)
elif "Bopomofo" == transcription:
r = db.get_pinyin(text, taiwan=True)
r = bopomofo_module.bopomofo(no_accents(r))
else:
r = ""
return r
#!Changes! (I've added an extra function which converts pinyin to bopomofo)
def pinyin_to_bopomofo(pinyin):
u'''
Converts Pinyin to Bopomofo.
'''
return bopomofo_module.bopomofo(no_accents(cleanup(pinyin)))
#!/Changes!
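# Illustrative example (added here for clarity, not part of the original add-on): pinyin_to_bopomofo(u"nǐ hǎo") should
# yield something like u"ㄋㄧˇ ㄏㄠˇ"; the exact output depends on the bundled bopomofo module.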
#Should this function be removed? It appears to do nothing, and get_alternate_spellings is defined later on.
def get_alt(text):
"""Returns alternate spelling of Chinese expression"""
def translate_local(text, lang):
"""Translate using local dictionary.
lang is one of "en", "fr", "de", "es"
"""
defs = db.get_definitions(text, lang)
if 0 == len(defs):
return ""
def are_there_multiple_pinyins(defs):
(prev_p, a, b, c)= defs[0]
for (pinyin, definition, cl, alt) in defs:
if pinyin<>prev_p:
return True
return False
res = ""
if are_there_multiple_pinyins(defs):
for (pinyin, definition, cl, alt) in defs:
res += u"❖ %s[%s] %s\n" % (text, pinyin, definition)
else:
for (pinyin, definition, cl, alt) in defs:
res += " \t"+definition+"\n"
res = res.replace("\n", "\n<br>")
res = local_dict_colorize(res)
return res
def translate(text, from_lang="zh", to_lang=None, progress_bar=True):
u'''Translate to a different language.
Eg: '你好' becomes 'Hello'
Only installed dictionaries can be used.
to_lang possible values : "local_en", "local_de", "local_fr"
or a 2-letter ISO language code for MS Translate
if to_lang is unspecified, the default language will be used.
if progress_bar is True, then will display a progress bar.
'''
global MS_translator_object
text = cleanup(text)
if "" == text:
return ""
if None == to_lang:
to_lang = chinese_support_config.options["dictionary"]
if "None" == to_lang:
return ""
if to_lang.startswith("local_"): #Local dict
return translate_local(text, to_lang[-2:])
else: #Ms translate
ret = ""
if progress_bar:
mw.progress.start(label="MS Translator lookup", immediate=True)
if None == MS_translator_object:
MS_translator_object = MSTranslator("chinese-support-add-on", "Mh+X5YY17LZZ8rO9hzJXYD3I02V3E+ltItF15ep7qG8=")
try:
ret = MS_translator_object.translate(text, to_lang)
except:
pass
if "ArgumentException:" == ret[:18]:
#Token has probably expired
ret=""
if progress_bar:
mw.progress.finish()
return ret
def cleanup(txt):
if not txt:
return ""
txt = re.sub(r"<.*?>", "", txt, flags=re.S)
txt = txt.replace("&nbsp;", " ")
txt = re.sub(r"^\s*", "", txt)
txt = re.sub(r"\s*$", "", txt)
# txt = re.sub(r"[\s+]", " ", txt)
return txt
def colorize_fuse(hanzi, pinyin, ruby=False):
u'''Gives color to a Hanzi phrase based on the tone info from a
corresponding Pinyin phrase.
If ruby = True, then annotate with pinyin on top of each character
Eg: "你好" and "ni3 hao3" -> 你好 (both colorized as 3rd tone).
'''
pinyin = cleanup(no_color(pinyin))+" "*len(hanzi)
hanzi = cleanup(hanzi)
text = ""
# print hanzi, "\t", pinyin
for h in hanzi:
if len(pinyin)<5:
pinyin = pinyin+" "
if has_hanzi(h):
[p, pinyin] = pinyin.split(" ", 1)
# print "C1\t", h, "\t", p
if ruby:
text += u'<span class="tone{t}"><ruby>{h}<rt>{p}</rt></span>'.format(t=get_tone_number(p), h=h, p=p)
else:
text += u'<span class="tone{t}">{h}</span>'.format(t=get_tone_number(p), h=h)
elif " "==h and " "!=pinyin[0]:
text += " "
# print "C2\t_\t(none)"
else:
# print "C3\t", h, "\t", pinyin[0]
text += pinyin[0]
pinyin = pinyin[1:]
if " " == pinyin[0]:
pinyin = pinyin[1:]
return text
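# Illustrative example (added here for clarity, not part of the original add-on): colorize_fuse(u"你好", u"ni3 hao3", ruby=True)
# pairs each character with its syllable and produces
# u'<span class="tone3"><ruby>你<rt>ni3</rt></span><span class="tone3"><ruby>好<rt>hao3</rt></span>'.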
def pinyin(text):
return transcribe(text, transcription="Pinyin")
def get_mean_word(text):
if text == "":
return ""
cl = db.get_classifiers(text)
if len(cl):
return local_dict_colorize(", ".join(cl))
else:
return ""
def get_alternate_spellings(text):
if text == "":
return ""
alt = db.get_alt_spellings(text)
if len(alt):
return local_dict_colorize(", ".join(alt))
else:
return ""
def sound(text, source=None):
'''
Returns sound tag for a given Hanzi string.
If the sound does not already exist in the media directory, then
attempt to obtain it from the specified source.
if the specified source is omitted, use the one selected in the
tools menu.
If it fails (eg: no network connexion while trying to retrieve
speech from Google TTS), return empty string.
Does not work with pinyin or other transcriptions.
'''
text = cleanup(text)
if None==source:
source = chinese_support_config.options['speech']
text = no_color(no_accents(no_sound(text)))
text = re.sub("<.*?>", "", text)
if has_ruby(text):
text = hanzi(text)
if "" == text:
return ""
if "Google TTS Mandarin" == source:
try:
return "[sound:"+google_tts.get_word_from_google(text)+"]"
except:
return ""
elif "Baidu Translate" == source:
try:
return "[sound:"+baidu_tts.get_word_from_baidu(text)+"]"
except:
return ""
else:
return ""
def check_for_sound(text):
'''
Returns True if the soundfile already exists in the user's resources directory.
'''
text = cleanup(text)
text = no_color(no_accents(no_sound(text)))
text = re.sub("<.*?>", "", text)
if has_ruby(text):
text = hanzi(text)
if "" == text:
return False
if google_tts.check_resources(text):
return True
return False
def get_any(fields, dico):
u'''Get the 1st valid field from a list
Scans all field names listed as "fields", to find one that exists,
then returns its value.
If none exists, returns an empty string.
Case-insensitive.
'''
for f in fields:
for k, v in dico.iteritems():
try:
if unicode(f.lower()) == unicode(k.lower()):
return dico[k]
except:
pass
return ""
def set_all(fields, dico, to):
u'''Set all existing fields to the same value.
(Non-existing fields are ignored)
Case-insensitive.
'''
for f in fields:
for d, v in dico.iteritems():
try:
if unicode(d.lower()) == unicode(f.lower()):
dico[d] = to
except:
pass
def has_field(fields, dico):
u'''
Check if one of the named fields exists in the field list
Case-insensitive.
'''
for d, v in dico.iteritems():
for f in fields:
try:
if unicode(f.lower()) == unicode(d.lower()):
return True
except:
pass
return False
def no_sound(text):
u'''
Removes the [sound:xxx.mp3] tag that's added by Anki when you record
sound into a field.
If you don't remove it before taking data from one field to another,
it will likely be duplicated, and the sound will play twice.
'''
return re.sub(r'\[sound:.*?]', '', text)
#!Changes! (I've added the ability to separate Jyutping as well)
def separate_pinyin(text, force=False, cantonese=False):
u"""
Separate pinyin syllables with whitespace.
Eg: "Yīlù píng'ān" becomes "Yī lù píng ān"
Does nothing if the default transcription is not Pinyin or Pinyin (Taiwan),
unless force="Pinyin" or force="Pinyin (Taiwan)" or force=True
Cantonese sets whether or not the text being separated is cantonese (if force=True).
Useful for people pasting Pinyin from Google Translate.
"""
if (chinese_support_config.options['transcription'] \
in ['Pinyin', 'Pinyin (Taiwan)'] and not force) or (force and not cantonese):
def clean(t):
'remove leading apostrophe'
if "'" == t[0]:
return t[1:]
return t
def separate_pinyin_sub(p):
return clean(p.group("one"))+" "+clean(p.group("two"))
text = pinyin_two_re.sub(separate_pinyin_sub, text)
#text = pinyin_two_re.sub(separate_pinyin_sub, text)
return text
elif (chinese_support_config.options['transcription'] \
in ['Cantonese'] and not force) or (force and cantonese):
def clean(t):
'remove leading apostrophe'
if "'" == t[0]:
return t[1:]
return t
def separate_jyutping_sub(p):
return clean(p.group("one"))+" "+clean(p.group("two"))
text = jyutping_two_re.sub(separate_jyutping_sub, text)
text = jyutping_two_re.sub(separate_jyutping_sub, text)
return text
else:
return text
#!/Changes!
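# Illustrative example (added here for clarity, not part of the original add-on): separate_pinyin(u"nei5hou2", force=True,
# cantonese=True) yields u"nei5 hou2" via jyutping_two_re, mirroring the existing pinyin behaviour shown in the docstring.
# The exact splits depend on the jyutping_re pattern defined further down.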
def simplify(text):
u'''Converts to simplified variants
'''
r = db.get_simplified(text)
return r
def traditional(text):
u'''Converts to traditional variants
'''
r = db.get_traditional(text)
return r
# Extra support functions and parameters
##################################################################
MS_translator_object = None
vowel_tone_dict = {
u'ā':1, u'ā':1, u'ɑ̄':1, u'ē':1, u'ī':1, u'ō':1, u'ū':1,
u'ǖ':1, u'Ā':1, u'Ē':1, u'Ī':1, u'Ō':1, u'Ū':1, u'Ǖ':1,
u'á':2, u'ɑ́':2, u'é':2, u'í':2, u'ó':2, u'ú':2, u'ǘ':2,
u'Á':2, u'É':2, u'Í':2, u'Ó':2, u'Ú':2, u'Ǘ':2,
u'ǎ':3, u'ɑ̌':3, u'ě':3, u'ǐ':3, u'ǒ':3, u'ǔ':3, u'ǚ':3,
u'Ǎ':3, u'Ě':3, u'Ǐ':3, u'Ǒ':3, u'Ǔ':3, u'Ǚ':3,
u'à':4, u'ɑ̀':4, u'è':4, u'ì':4, u'ò':4, u'ù':4, u'ǜ':4,
u'À':4, u'È':4, u'Ì':4, u'Ò':4, u'Ù':4, u'Ǜ':4
}
vowel_decorations = [
{ },
{ u'a':u'ā', u'e':u'ē', u'i':u'ī', u'o':u'ō', u'u':u'ū', u'ü':u'ǖ', u'v':u'ǖ'},
{ u'a':u'á', u'e':u'é', u'i':u'í', u'o':u'ó', u'u':u'ú', u'ü':u'ǘ', u'v':u'ǘ'},
{ u'a':u'ǎ', u'e':u'ě', u'i':u'ǐ', u'o':u'ǒ', u'u':u'ǔ', u'ü':u'ǚ', u'v':u'ǚ'},
{ u'a':u'à', u'e':u'è', u'i':u'ì', u'o':u'ò', u'u':u'ù', u'ü':u'ǜ', u'v':u'ǜ'},
{ u'a':u'a', u'e':u'e', u'i':u'i', u'o':u'o', u'u':u'u', u'ü':u'ü', u'v':u'ü'},
]
base_letters = {
u'ā':u'a', u'ē':u'e', u'ī':u'i', u'ō':u'o', u'ū':u'u', u'ǖ':u'ü',
u'á':u'a', u'é':u'e', u'í':u'i', u'ó':u'o', u'ú':u'u', u'ǘ':u'ü',
u'ǎ':u'a', u'ě':u'e', u'ǐ':u'i', u'ǒ':u'o', u'ǔ':u'u', u'ǚ':u'ü',
u'à':u'a', u'è':u'e', u'ì':u'i', u'ò':u'o', u'ù':u'u', u'ǜ':u'ü',
u'a':u'a', u'e':u'e', u'i':u'i', u'o':u'o', u'u':u'u', u'ü':u'ü',
}
accents = u'ɑ̄āĀáɑ́ǎɑ̌ÁǍàɑ̀ÀēĒéÉěĚèÈīĪíÍǐǏìÌōŌóÓǒǑòÒūŪúÚǔǓùÙǖǕǘǗǚǙǜǛ'
def pinyin_re_sub():
inits = u"zh|sh|ch|[bpmfdtnlgkhjqxrzscwy]"
finals = u"i[ōóǒòo]ng|[ūúǔùu]ng|[āáǎàa]ng|[ēéěèe]ng|i[āɑ̄áɑ́ɑ́ǎɑ̌àɑ̀aāáǎàa]ng|[īíǐìi]ng|i[āáǎàa]n|u[āáǎàa]n|[ōóǒòo]ng|[ēéěèe]r|i[āáǎàa]|i[ēéěèe]|i[āáǎàa]o|i[ūúǔùu]|[īíǐìi]n|u[āáǎàa]|u[ōóǒòo]|u[āáǎàa]i|u[īíǐìi]|[ūúǔùu]n|u[ēéěèe]|ü[ēéěèe]|v[ēéěèe]|i[ōóǒòo]|[āáǎàa]i|[ēéěèe]i|[āáǎàa]o|[ōóǒòo]u|[āáǎàa]n|[ēéěèe]n|[āáǎàa]|[ēéěèe]|[ōóǒòo]|[īíǐìi]|[ūúǔùu]|[ǖǘǚǜüv]"
standalones = u"'[āáǎàa]ng|'[ēéěèe]ng|'[ēéěèe]r|'[āáǎàa]i|'[ēéěèe]i|'[āáǎàa]o|'[ōóǒòo]u|'[āáǎàa]n|'[ēéěèe]n|'[āáǎàa]|'[ēéěèe]|'[ōóǒòo]"
return "(("+inits+")("+finals+")[1-5]?|("+standalones+")[1-5]?)"
pinyin_re = pinyin_re_sub()
pinyin_two_re = re.compile("(?P<one>"+pinyin_re+")(?P<two>"+pinyin_re+")", flags=re.I)
#!Changes! (These are the jyutping separating functions)
def jyutping_re_sub():
inits = u"ng|gw|kw|[bpmfdtnlgkhwzcsj]"
finals = u"i|ip|it|ik|im|in|ing|iu|yu|yut|yun|u|up|ut|uk|um|un|ung|ui|e|ep|et|ek|em|en|eng|ei|eu|eot|eon|eoi|oe|oet|oek|oeng|oei|o|ot|ok|om|on|ong|oi|ou|ap|at|ak|am|an|ang|ai|au|aa|aap|aat|aak|aam|aan|aang|aai|aau|m|ng"
standalones = u"'uk|'ung|'e|'ei|'oe|'o|'ok|'om|'on|'ong|'oi|'ou|'ap|'at|'ak|'am|'an|'ang|'ai|'au|'aa|'aap|'aat|'aak|'aam|'aan|'aang|'aai|'aau|'m|'ng"
return "(("+inits+")("+finals+")[1-6]?|("+standalones+")[1-6]?)"
jyutping_re = jyutping_re_sub()
jyutping_two_re = re.compile("(?P<one>"+jyutping_re+")(?P<two>"+jyutping_re+")", flags=re.I)
#!/Changes!
db = dictdb.DictDB()
bopomofo_notes = {
u"ˊ":"2", u"ˇ":"3",u"ˋ":"4", u"˙":"5"}
def extract_sound_tags(text):
sound_tags = re.findall(r"\[sound:.*?\]", text)
if [] == sound_tags:
sound_tags=""
else:
sound_tags = reduce(lambda a,b:a+b, sound_tags)
nosound = re.sub(r"\[sound:.*?\]", r"", text)
return nosound, sound_tags
def get_tone_number(pinyin):
if re.match(r".+1[0-9]$", pinyin):
return pinyin[-2:]
elif re.match(r".+[0-9]$", pinyin):
return pinyin[-1:]
elif re.match(u".+[¹²³⁴]$", pinyin):
return str(u" ¹²³⁴".index(pinyin[-1:]))
elif re.match(u"[\u3100-\u312F]", pinyin):#Bopomofo
if re.match(u"[ˊˇˋ˙]", pinyin[-1:]):
return str(u" ˊˇˋ˙".index(pinyin[-1:]))
else:
return "1"
else:
for c in pinyin:
try:
return str(vowel_tone_dict[c])
except KeyError:
continue
return "5"
def has_ruby(text):
return re.search(u"[\u3400-\u9fff]\[.+\]", text)
def has_hanzi(text):
return re.search(u"[\u3400-\u9fff]", text)
def get_character_transcription(hanzi, transcription=None, only_one=True):
if transcription == None:
transcription = chinese_support_config.options['transcription']
if "Pinyin" == transcription:
text = db.get_pinyin(hanzi)
elif "Pinyin (Taiwan)" == transcription:
text = db.get_pinyin(hanzi, taiwan=True)
elif "Cantonese" == transcription:
text = db.get_cantonese(hanzi, only_one)
elif "Bopomofo" == transcription:
text = db.get_pinyin(hanzi, taiwan=True)
text = bopomofo_module.bopomofo(no_accents(text))
else:
text = ""
return text
def add_diaeresis(text):
try:
return re.sub(u"v", u"ü", text)
except:
return ""
def local_dict_colorize(txt, ruby=True):
"""
Colorize text in the form :
"Hello is written 你好[ni3 hao]"
(as used in the local dictionaries)
"""
def _sub(p):
c = ""
hanzi = p.group(1)
pinyin = p.group(2)
pinyin = accentuate_pinyin(pinyin)
if ruby:
if 1 == hanzi.count("|"):
hanzi = hanzi.split("|")
c += colorize_fuse(hanzi[0], pinyin, True)
c += "|"
c += colorize_fuse(hanzi[1], pinyin, True)
else:
c += colorize_fuse(hanzi, pinyin, True)
else:
if 1 == hanzi.count("|"):
#Hanzi has 2 variants (traditional and simplified)
hanzi = hanzi.split("|")
c += colorize_fuse(hanzi[0], pinyin, False)
c += "|"
c += colorize_fuse(hanzi[1], pinyin, False)
else:
c += colorize_fuse(hanzi, pinyin, False)
c += "[" + colorize(pinyin) + "]"
return c
txt = re.sub(u"([\u3400-\u9fff|]+)\\[(.*?)\\]", _sub, txt)
return txt