common string utils #string
#!pip install jieba tqdm
from tqdm import tqdm
from typing import List, Dict
import jieba
import numpy as np

def load_char_vocab_and_corpus(train_data: List[str], min_count=2):
    """Build a character vocabulary and character-level corpus, dropping rare characters."""
    chars = dict()
    corpus = []
    for desc in tqdm(train_data):
        for c in desc:
            chars[c] = chars.get(c, 0) + 1
        corpus.append(list(desc))
    chars = {i: j for i, j in chars.items() if j >= min_count}
    idx2char = {i + 2: j for i, j in enumerate(chars)}  # 0: mask/padding, 1: unknown token
    char2idx = {j: i for i, j in idx2char.items()}
    return char2idx, idx2char, corpus

def load_bichar_vocab_and_corpus(train_data: List[str], min_count=2):
    """Build a character-bigram vocabulary and bigram-level corpus."""
    bichars = dict()
    corpus = []
    for desc in tqdm(train_data):
        bigrams = []
        for i in range(len(desc)):
            # the last character is paired with an end-of-sequence marker
            c = desc[i] + '</end>' if i == len(desc) - 1 else desc[i:i+2]
            bigrams.append(c)
            bichars[c] = bichars.get(c, 0) + 1
        corpus.append(bigrams)
    bichars = {i: j for i, j in bichars.items() if j >= min_count}
    idx2bichar = {i + 2: j for i, j in enumerate(bichars)}  # 0: mask/padding, 1: unknown token
    bichar2idx = {j: i for i, j in idx2bichar.items()}
    return bichar2idx, idx2bichar, corpus

def load_word_vocab_and_corpus(train_data: List[str], min_count=2):
    """Build a word vocabulary and word-level corpus using jieba segmentation."""
    words = dict()
    corpus = []
    for desc in tqdm(train_data):
        desc_cut = jieba.lcut(desc)
        for w in desc_cut:
            words[w] = words.get(w, 0) + 1
        corpus.append(desc_cut)
    words = {i: j for i, j in words.items() if j >= min_count}
    idx2word = {i + 2: j for i, j in enumerate(words)}  # 0: mask/padding, 1: unknown token
    word2idx = {j: i for i, j in idx2word.items()}
    return word2idx, idx2word, corpus

def load_charpos_vocab_and_corpus(char2idx, train_data: List[str]):
    """Build a position-aware character vocabulary by assigning four positional tags: <B> <M> <E> <S>."""
    charpos2idx = {'<B>': 2, '<M>': 3, '<E>': 4, '<S>': 5}
    for c in char2idx.keys():
        charpos2idx[c + '<B>'] = len(charpos2idx) + 2
        charpos2idx[c + '<M>'] = len(charpos2idx) + 2
        charpos2idx[c + '<E>'] = len(charpos2idx) + 2
        charpos2idx[c + '<S>'] = len(charpos2idx) + 2
    idx2charpos = dict((idx, c) for c, idx in charpos2idx.items())
    corpus = []
    for desc in tqdm(train_data):
        desc_cut = jieba.lcut(desc)
        desc_pos = []
        for word in desc_cut:
            if len(word) == 1:
                desc_pos.append(word + '<S>')  # single character as one word
            else:
                for i in range(len(word)):
                    if i == 0:
                        desc_pos.append(word[i] + '<B>')  # begin of word
                    elif i == len(word) - 1:
                        desc_pos.append(word[i] + '<E>')  # end of word
                    else:
                        desc_pos.append(word[i] + '<M>')  # middle of word
        corpus.append(desc_pos)
    return charpos2idx, idx2charpos, corpus

def train_valid_split(train_data, random=False):
    random_order = list(range(len(train_data)))
    if random:
        np.random.shuffle(random_order)
    dev_data = [train_data[j] for i, j in enumerate(random_order) if i % 9 == 0]
    train_data = [train_data[j] for i, j in enumerate(random_order) if i % 9 != 0]
    return train_data, dev_data
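
A minimal usage sketch for the builders above, assuming a toy in-memory corpus (the two sample sentences are invented purely for illustration):

texts = ['中国和美国签署协议', '俄罗斯与中国会谈']
char2idx, idx2char, char_corpus = load_char_vocab_and_corpus(texts, min_count=1)
word2idx, idx2word, word_corpus = load_word_vocab_and_corpus(texts, min_count=1)
charpos2idx, idx2charpos, charpos_corpus = load_charpos_vocab_and_corpus(char2idx, texts)
train_texts, dev_texts = train_valid_split(texts, random=True)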
'''
TODO: handle fused country mentions such as
"中俄" "印俄"
"以色列帮助中国"
"美俄英法"
'''
# Strip a leading 前 ("former"), e.g. "前南斯拉夫", "前苏联/俄罗斯".
# Fragment: country_name_map, country_string_dealer and SPLIT_TOKEN are
# defined elsewhere in the original file; re must be imported.
if s:
    if s[0] == '前':
        s = s[1:]
    s = country_name_map.get(s, s)
    s = s.strip('(').strip(')').strip('《').strip('》').strip('◎')
    assert s != '国'
    # Deal with multi-country contexts like '中、美和日' and '美国中国':
    # add a delimiter after every '国', then split on common delimiters.
    s = s.replace('国', '国、').strip('、')
    s = s.replace('共和', '[UNK]')  # protect 共和 (as in 共和国) from the split
    REGEX_PATTERN = r'[、和与。,;—:)/■▼]'
    s_list = re.split(REGEX_PATTERN, s)
    s_tokens = []
    for sub in s_list:
        sub = sub.replace('[UNK]', '共和')  # restore the protected 共和
        sub = country_string_dealer(sub)
        s_tokens.append(sub)
    s = SPLIT_TOKEN.join(s_tokens)
return s
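
For reference, a self-contained sketch of the splitting step, with hypothetical stand-ins for the names the fragment assumes (SPLIT_TOKEN and country_string_dealer exist elsewhere in the original; the stubs below are not the author's versions):

import re

SPLIT_TOKEN = '|'  # hypothetical separator token

def country_string_dealer(sub):  # stub for the real normalizer assumed above
    return sub.strip()

s = '中国、美国和日本'
s = s.replace('国', '国、').strip('、')   # -> '中国、、美国、和日本'
s = s.replace('共和', '[UNK]')
pieces = [p.replace('[UNK]', '共和')
          for p in re.split(r'[、和与。,;—:)/■▼]', s) if p]
print(SPLIT_TOKEN.join(country_string_dealer(p) for p in pieces))  # 中国|美国|日本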
import os
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from fastText import train_unsupervised
from glove import Glove, Corpus

def load_glove_format(filename):
    word_vectors = {}
    embeddings_dim = -1
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip().split()
            try:
                word = line[0]
                word_vector = np.array([float(v) for v in line[1:]])
            except ValueError:
                continue
            if embeddings_dim == -1:
                embeddings_dim = len(word_vector)
            if len(word_vector) != embeddings_dim:
                continue
            word_vectors[word] = word_vector
    assert all(len(vw) == embeddings_dim for vw in word_vectors.values())
    return word_vectors, embeddings_dim

def load_pre_trained(load_filename, vocabulary=None):
    """Load pre-trained embeddings in word2vec format, falling back to GloVe text format."""
    word_vectors = {}
    try:
        model = KeyedVectors.load_word2vec_format(load_filename)
        embedding_dim = model.vectors.shape[1]
        for word, idx in model.key_to_index.items():  # gensim 4 API, matching train_w2v below
            word_vectors[word] = model.vectors[idx, :]
    except ValueError:
        word_vectors, embedding_dim = load_glove_format(load_filename)
    if vocabulary is not None:
        emb = np.zeros(shape=(len(vocabulary) + 2, embedding_dim), dtype='float32')  # 0 for mask, 1 for unknown token
        emb[1] = np.random.normal(0, 0.05, embedding_dim)
        nb_unk = 0
        for w, i in vocabulary.items():
            if w not in word_vectors:
                nb_unk += 1
                emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
            else:
                emb[i, :] = word_vectors[w]
        print('Logging Info - From {} Embedding matrix created: {}, unknown tokens: {}'.format(
            load_filename, emb.shape, nb_unk))
        return emb
    else:
        print('Logging Info - Loading {} Embedding: {}'.format(load_filename, (len(word_vectors), embedding_dim)))
        return word_vectors

def train_w2v(corpus, vocabulary, embedding_dim=300):
    """Train a skip-gram Word2Vec model on the corpus and build an embedding matrix (gensim 4 API)."""
    model = Word2Vec(corpus, vector_size=embedding_dim, min_count=1, window=5, sg=1, epochs=10)
    weights = model.wv.vectors
    d = model.wv.key_to_index
    emb = np.zeros(shape=(len(vocabulary) + 2, embedding_dim), dtype='float32')  # 0 for mask, 1 for unknown token
    emb[1] = np.random.normal(0, 0.05, embedding_dim)
    nb_unk = 0
    for w, i in vocabulary.items():
        if w not in d:
            nb_unk += 1
            emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
        else:
            emb[i, :] = weights[d[w], :]
    print('Logging Info - Word2Vec Embedding matrix created: {}, unknown tokens: {}'.format(emb.shape, nb_unk))
    return emb

# Here we use a Python implementation of GloVe, but the official C implementation is also highly
# recommended: https://github.com/stanfordnlp/GloVe/blob/master/demo.sh
def train_glove(corpus, vocabulary, embedding_dim=300):
    corpus_model = Corpus()
    corpus_model.fit(corpus, window=10)
    glove = Glove(no_components=embedding_dim, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    emb = np.zeros(shape=(len(vocabulary) + 2, embedding_dim), dtype='float32')  # 0 for mask, 1 for unknown token
    emb[1] = np.random.normal(0, 0.05, embedding_dim)
    nb_unk = 0
    for w, i in vocabulary.items():
        if w not in glove.dictionary:
            nb_unk += 1
            emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
        else:
            emb[i, :] = glove.word_vectors[glove.dictionary[w]]
    print('Logging Info - Glove Embedding matrix created: {}, unknown tokens: {}'.format(emb.shape, nb_unk))
    return emb

def train_fasttext(corpus, vocabulary, embedding_dim=300):
    corpus_file_path = 'fasttext_tmp_corpus.txt'
    with open(corpus_file_path, 'w', encoding='utf8') as writer:
        for sentence in corpus:
            writer.write(' '.join(sentence) + '\n')
    model = train_unsupervised(input=corpus_file_path, model='skipgram', epoch=10, minCount=1, wordNgrams=3,
                               dim=embedding_dim)
    model_vocab = set(model.get_words())  # set for O(1) membership tests
    emb = np.zeros(shape=(len(vocabulary) + 2, embedding_dim), dtype='float32')  # 0 for mask, 1 for unknown token
    emb[1] = np.random.normal(0, 0.05, embedding_dim)
    nb_unk = 0
    for w, i in vocabulary.items():
        if w not in model_vocab:
            nb_unk += 1
            emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
        else:
            emb[i, :] = model.get_word_vector(w)
    print('Logging Info - Fasttext Embedding matrix created: {}, unknown tokens: {}'.format(emb.shape, nb_unk))
    os.remove(corpus_file_path)
    return emb
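
A hedged end-to-end sketch tying these trainers to the vocabulary builders from the first snippet (the corpus and dimensionality are illustrative, not the author's settings):

texts = ['中国和美国签署协议', '俄罗斯与中国会谈']
word2idx, idx2word, word_corpus = load_word_vocab_and_corpus(texts, min_count=1)
w2v_emb = train_w2v(word_corpus, word2idx, embedding_dim=100)    # shape: (len(word2idx) + 2, 100)
glove_emb = train_glove(word_corpus, word2idx, embedding_dim=100)
ft_emb = train_fasttext(word_corpus, word2idx, embedding_dim=100)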
line = '俄罗斯战胜'
country_lookup = {
    '俄': '俄罗斯',
    '罗': '罗马尼亚'
}
# Scan character by character; when a character keys a known country name and
# the full name is present, mask it with '*' so its remaining characters
# (e.g. the '罗' inside '俄罗斯') cannot trigger a spurious second match ('罗马尼亚').
for i in range(len(line)):
    c = line[i]
    if c in country_lookup:
        c_country = country_lookup[c]
        if c_country in line:
            line = line.replace(c_country, '*' * len(c_country))
            print(c_country)
'''
Drop this under site-packages. Python 2 only: sys.setdefaultencoding is
deleted from sys at interpreter startup and only reappears after a reload;
it does not exist in Python 3, where str is already Unicode.
'''
# encoding=utf8
import sys
reload(sys)  # Python 2 built-in
sys.setdefaultencoding('utf8')
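
Under Python 3 this hack is neither available nor needed; the standard-library equivalents are explicit per-stream encodings (the file name below is illustrative):

import io
import sys

# Open files with an explicit encoding instead of relying on a process-wide default.
with open('corpus.txt', encoding='utf8') as f:
    text = f.read()

# Rewrap stdout when the console encoding is wrong.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')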
'''
Chinese character detection:
CJK unified ideographs occupy the Unicode range u4e00 to u9fa5.
'''
def is_chinese(uchar):
    """Return True if the character is a Chinese (CJK) character."""
    return u'\u4e00' <= uchar <= u'\u9fa5'

'''
Digit (0-9) detection:
digits occupy two Unicode ranges depending on width:
halfwidth u0030 to u0039, fullwidth uff10 to uff19.
'''
def is_number(uchar):
    """Return True if the character is a halfwidth digit."""
    return u'\u0030' <= uchar <= u'\u0039'

def is_Qnumber(uchar):
    """Return True if the character is a fullwidth digit."""
    return u'\uff10' <= uchar <= u'\uff19'

'''
Letter detection:
letters occupy four Unicode ranges depending on case and width.
Halfwidth uppercase: u0041 - u005a; halfwidth lowercase: u0061 - u007a.
Fullwidth uppercase: uff21 - uff3a; fullwidth lowercase: uff41 - uff5a.
'''
def is_alphabet(uchar):
    """Return True if the character is a halfwidth Latin letter."""
    return (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a')

def is_Qalphabet(uchar):
    """Return True if the character is a fullwidth Latin letter."""
    return (u'\uff21' <= uchar <= u'\uff3a') or (u'\uff41' <= uchar <= u'\uff5a')

'''
Everything else:
characters that are neither Chinese characters, digits 0-9, nor letters.
'''
def is_other(uchar):
    """Return True if the character is not a Chinese character, digit or letter."""
    return not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar))

'''
Halfwidth/fullwidth conversion (relies on the predicates above):
1. halfwidth -> fullwidth: characters outside the halfwidth range pass through
   unchanged; the space is special-cased; otherwise fullwidth = halfwidth + 0xfee0.
2. fullwidth -> halfwidth: the reverse, halfwidth = fullwidth - 0xfee0.
'''
def B2Q(uchar):
    """Convert a single halfwidth character to fullwidth."""
    inside_code = ord(uchar)
    if inside_code < 0x0020 or inside_code > 0x7e:  # not a halfwidth character: return unchanged
        return uchar
    if inside_code == 0x0020:  # space maps to the ideographic space; otherwise fullwidth = halfwidth + 0xfee0
        inside_code = 0x3000
    else:
        inside_code += 0xfee0
    return chr(inside_code)

def Q2B(uchar):
    """Convert a single fullwidth character to halfwidth."""
    inside_code = ord(uchar)
    if inside_code == 0x3000:
        inside_code = 0x0020
    else:
        inside_code -= 0xfee0
    if inside_code < 0x0020 or inside_code > 0x7e:  # result is not a halfwidth character: return the original
        return uchar
    return chr(inside_code)

def stringQ2B(ustring):
    """Convert all fullwidth characters in a string to halfwidth."""
    return "".join([Q2B(uchar) for uchar in ustring])

def stringpartQ2B(ustring):
    """Convert only the fullwidth digits and letters in a string to halfwidth."""
    return "".join([Q2B(uchar) if is_Qnumber(uchar) or is_Qalphabet(uchar) else uchar for uchar in ustring])

if __name__ == '__main__':
    text = "电影《2012》讲述了2012年12月21日的世界末日,主人公Jack以及世界各国人民挣扎求生的经历,灾难面前,尽现人间百态。"
    print("original text:", text, sep="\n", end="\n")
    text1 = stringQ2B(text)
    print("fullwidth -> halfwidth:", text1, sep="\n", end="\n")
    text2 = stringpartQ2B(text)
    print("fullwidth digits/letters -> halfwidth:", text2, sep="\n", end="\n")
from typing import List
import jieba_fast

def preprocess(text: str) -> str:
    text = text.strip()
    return text

def tokenizer(texts, n_grams=(1, 2), deduplicate=True) -> List[str]:
    tokens = []
    if isinstance(texts, str):
        texts = [texts]
    for text in texts:
        tokens.append(text)  # keep the whole raw text as one candidate token
        text = preprocess(text)
        # cut words with jieba
        tokens += list(jieba_fast.cut_for_search(text))
        # cut character n-grams
        for n in range(min(n_grams), max(n_grams) + 1):
            assert n > 0
            tokens += [text[i:i + n] for i in range(len(text) - n + 1)]
    return list(set(tokens)) if deduplicate else tokens

if __name__ == '__main__':
    text = '中美'
    tokens = tokenizer(text, deduplicate=False)
    print(tokens)