@ywzhang909
Last active April 26, 2022 05:59
common string utils#string
#!pip install jieba tqdm
from tqdm import tqdm
from typing import List, Dict
import jieba
import numpy as np


def load_char_vocab_and_corpus(train_data: List[str], min_count=2):
    """Build a character vocabulary (filtered by min_count) and a character-level corpus."""
    chars = dict()
    corpus = []
    for desc in tqdm(train_data):
        for c in desc:
            chars[c] = chars.get(c, 0) + 1
        corpus.append(list(desc))
    chars = {i: j for i, j in chars.items() if j >= min_count}
    idx2char = {i + 2: j for i, j in enumerate(chars)}  # 0: mask, 1: padding
    char2idx = {j: i for i, j in idx2char.items()}
    return char2idx, idx2char, corpus
def load_bichar_vocab_and_corpus(train_data: List[str], min_count=2):
    """Build a character-bigram vocabulary and a bigram-level corpus; the last character is paired with '</end>'."""
    bichars = dict()
    corpus = []
    for desc in tqdm(train_data):
        bigrams = []
        for i in range(len(desc)):
            c = (desc[i] + '</end>') if i == len(desc) - 1 else desc[i:i + 2]
            bigrams.append(c)
            bichars[c] = bichars.get(c, 0) + 1
        corpus.append(bigrams)
    bichars = {i: j for i, j in bichars.items() if j >= min_count}
    idx2bichar = {i + 2: j for i, j in enumerate(bichars)}  # 0: mask, 1: padding
    bichar2idx = {j: i for i, j in idx2bichar.items()}
    return bichar2idx, idx2bichar, corpus
def load_word_vocab_and_corpus(train_data: List[str], min_count=2):
    """Build a word vocabulary (using jieba segmentation) and a word-level corpus."""
    words = dict()
    corpus = []
    for desc in tqdm(train_data):
        desc_cut = jieba.lcut(desc)
        for w in desc_cut:
            words[w] = words.get(w, 0) + 1
        corpus.append(desc_cut)
    words = {i: j for i, j in words.items() if j >= min_count}
    idx2word = {i + 2: j for i, j in enumerate(words)}  # 0: mask, 1: padding
    word2idx = {j: i for i, j in idx2word.items()}
    return word2idx, idx2word, corpus
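# A minimal usage sketch for the three vocabulary builders above. The sample sentences are made up
# for illustration, and the word-level output depends on how jieba segments them.
if __name__ == '__main__':
    train_data = ['我爱北京', '我爱上海']
    char2idx, idx2char, char_corpus = load_char_vocab_and_corpus(train_data, min_count=1)
    bichar2idx, idx2bichar, bichar_corpus = load_bichar_vocab_and_corpus(train_data, min_count=1)
    word2idx, idx2word, word_corpus = load_word_vocab_and_corpus(train_data, min_count=1)
    print(char_corpus[0])    # ['我', '爱', '北', '京']
    print(bichar_corpus[0])  # ['我爱', '爱北', '北京', '京</end>']
    print(word_corpus[0])    # segmentation-dependent, e.g. ['我', '爱', '北京']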
def load_charpos_vocab_and_corpus(char2idx, train_data: List[str]):
    """Build a position-aware character vocabulary by assigning 4 positional tags: <B> <M> <E> <S>."""
    charpos2idx = {'<B>': 2, '<M>': 3, '<E>': 4, '<S>': 5}
    for c in char2idx.keys():
        charpos2idx[c + '<B>'] = len(charpos2idx) + 2
        charpos2idx[c + '<M>'] = len(charpos2idx) + 2
        charpos2idx[c + '<E>'] = len(charpos2idx) + 2
        charpos2idx[c + '<S>'] = len(charpos2idx) + 2
    idx2charpos = dict((idx, c) for c, idx in charpos2idx.items())

    corpus = []
    for desc in tqdm(train_data):
        desc_cut = jieba.lcut(desc)
        desc_pos = []
        for word in desc_cut:
            if len(word) == 1:
                desc_pos.append(word + '<S>')  # single character as one word
            else:
                for i in range(len(word)):
                    if i == 0:
                        desc_pos.append(word[i] + '<B>')  # begin
                    elif i == len(word) - 1:
                        desc_pos.append(word[i] + '<E>')  # end
                    else:
                        desc_pos.append(word[i] + '<M>')  # middle
        corpus.append(desc_pos)
    return charpos2idx, idx2charpos, corpus
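# Usage sketch for the position-aware vocabulary; char2idx is built by load_char_vocab_and_corpus
# above, and the exact output depends on jieba's segmentation.
char2idx, _, _ = load_char_vocab_and_corpus(['我爱北京'], min_count=1)
charpos2idx, idx2charpos, pos_corpus = load_charpos_vocab_and_corpus(char2idx, ['我爱北京'])
print(pos_corpus[0])  # with segmentation ['我', '爱', '北京'] this is ['我<S>', '爱<S>', '北<B>', '京<E>']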
def train_valid_split(train_data, random=False):
    random_order = list(range(len(train_data)))
    if random:
        np.random.shuffle(random_order)
    dev_data = [train_data[j] for i, j in enumerate(random_order) if i % 9 == 0]
    train_data = [train_data[j] for i, j in enumerate(random_order) if i % 9 != 0]
    return train_data, dev_data
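# Usage sketch: with random=True the indices are shuffled first; roughly 1/9 of the samples go to dev.
train_part, dev_part = train_valid_split(list(range(18)), random=False)
print(len(train_part), len(dev_part))  # 16 2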
'''
TODO:
"中俄" "印俄"
"以色列帮助中国"
"美俄英法"
'''
# "前南斯拉夫" "前苏联/俄罗斯" "前南斯拉夫"
# Fragment of a country-name normalization routine. The enclosing function name below is a
# placeholder; country_name_map, country_string_dealer and SPLIT_TOKEN are defined elsewhere.
import re


def normalize_country_string(s):
    if s:
        if s[0] == '前':  # drop the "former" prefix, e.g. '前苏联' -> '苏联'
            s = s[1:]
        s = country_name_map.get(s, s)
        s = s.strip('(').strip(')').strip('《').strip('》').strip('◎')
        assert s != '国'
        # deal with multi-country contexts like '中、美和日' or '美国中国'
        s = s.replace('国', '国、').strip('、')
        s = s.replace('共和', '[UNK]')  # protect '共和' so that splitting on '和' below does not break it
        REGEX_PATTERN = r'[、|和|与|。|,|;|—|:|)|/|■|▼]'
        s_list = re.split(REGEX_PATTERN, s)
        s_tokens = []
        for sub in s_list:
            sub = sub.replace('[UNK]', '共和')  # restore '共和'
            sub = country_string_dealer(sub)
            s_tokens.append(sub)
        s = SPLIT_TOKEN.join(s_tokens)
    return s
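# For reference, a small demo of how the split pattern above behaves on a multi-country string.
# Note that inside a character class '|' is a literal pipe character, not alternation.
import re
print(re.split(r'[、|和|与|。|,|;|—|:|)|/|■|▼]', '中、美和日'))  # ['中', '美', '日']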
import os
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from fastText import train_unsupervised
from glove import Glove, Corpus
def load_glove_format(filename):
    """Load word vectors from a glove-format text file (one word plus its vector per line)."""
    word_vectors = {}
    embeddings_dim = -1
    with open(filename, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip().split()
            try:
                word = line[0]
                word_vector = np.array([float(v) for v in line[1:]])
            except ValueError:
                continue
            if embeddings_dim == -1:
                embeddings_dim = len(word_vector)
            if len(word_vector) != embeddings_dim:
                continue
            word_vectors[word] = word_vector
    assert all(len(vw) == embeddings_dim for vw in word_vectors.values())
    return word_vectors, embeddings_dim
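# Usage sketch; 'glove_vectors.txt' is a hypothetical path to a GloVe-format text file.
word_vectors, dim = load_glove_format('glove_vectors.txt')
print('loaded {} vectors of dim {}'.format(len(word_vectors), dim))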
def load_pre_trained(load_filename, vocabulary=None):
    """Load pre-trained embeddings in word2vec or glove text format; optionally map them onto a vocabulary."""
    word_vectors = {}
    try:
        model = KeyedVectors.load_word2vec_format(load_filename)
        weights = model.vectors                    # gensim >= 4.0 attribute (formerly syn0)
        embedding_dim = weights.shape[1]
        for k, idx in model.key_to_index.items():  # gensim >= 4.0 attribute (formerly vocab)
            word_vectors[k] = weights[idx, :]
    except ValueError:
        word_vectors, embedding_dim = load_glove_format(load_filename)
    if vocabulary is not None:
        emb = np.zeros(shape=(len(vocabulary) + 2, embedding_dim), dtype='float32')
        emb[1] = np.random.normal(0, 0.05, embedding_dim)
        nb_unk = 0
        for w, i in vocabulary.items():
            if w not in word_vectors:
                nb_unk += 1
                emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
            else:
                emb[i, :] = word_vectors[w]
        print('Logging Info - From {} Embedding matrix created: {}, unknown tokens: {}'.format(
            load_filename, emb.shape, nb_unk))
        return emb
    else:
        print('Logging Info - Loading {} Embedding: {}'.format(load_filename, (len(word_vectors), embedding_dim)))
        return word_vectors
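# Usage sketch; the file path is hypothetical and char2idx is a vocabulary built by the loaders above.
emb_matrix = load_pre_trained('pretrained_char_vectors.txt', vocabulary=char2idx)
print(emb_matrix.shape)  # (len(char2idx) + 2, embedding_dim)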
def train_w2v(corpus, vocabulary, embedding_dim=300):
    """Train a skip-gram Word2Vec model on the corpus and build an embedding matrix for the vocabulary."""
    model = Word2Vec(corpus, vector_size=embedding_dim, min_count=1, window=5, sg=1, epochs=10)
    weights = model.wv.vectors          # gensim >= 4.0 attribute (formerly syn0)
    d = dict(model.wv.key_to_index)     # gensim >= 4.0 attribute (formerly vocab)
    emb = np.zeros(shape=(len(vocabulary) + 2, embedding_dim), dtype='float32')  # 0 for mask, 1 for unknown token
    emb[1] = np.random.normal(0, 0.05, embedding_dim)
    nb_unk = 0
    for w, i in vocabulary.items():
        if w not in d:
            nb_unk += 1
            emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
        else:
            emb[i, :] = weights[d[w], :]
    print('Logging Info - Word2Vec Embedding matrix created: {}, unknown tokens: {}'.format(emb.shape, nb_unk))
    return emb
# Here we use a Python implementation of GloVe; the official C implementation is also highly
# recommended: https://github.com/stanfordnlp/GloVe/blob/master/demo.sh
def train_glove(corpus, vocabulary, embedding_dim=300):
    """Train GloVe vectors on the corpus and build an embedding matrix for the vocabulary."""
    corpus_model = Corpus()
    corpus_model.fit(corpus, window=10)
    glove = Glove(no_components=embedding_dim, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    emb = np.zeros(shape=(len(vocabulary) + 2, embedding_dim), dtype='float32')  # 0 for mask, 1 for unknown token
    emb[1] = np.random.normal(0, 0.05, embedding_dim)
    nb_unk = 0
    for w, i in vocabulary.items():
        if w not in glove.dictionary:
            nb_unk += 1
            emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
        else:
            emb[i, :] = glove.word_vectors[glove.dictionary[w]]
    print('Logging Info - Glove Embedding matrix created: {}, unknown tokens: {}'.format(emb.shape, nb_unk))
    return emb
def train_fasttext(corpus, vocabulary, embedding_dim=300):
    """Train fastText skip-gram vectors on the corpus and build an embedding matrix for the vocabulary."""
    corpus_file_path = 'fasttext_tmp_corpus.txt'
    with open(corpus_file_path, 'w', encoding='utf8') as writer:
        for sentence in corpus:
            writer.write(' '.join(sentence) + '\n')

    model = train_unsupervised(input=corpus_file_path, model='skipgram', epoch=10, minCount=1, wordNgrams=3,
                               dim=embedding_dim)
    model_vocab = set(model.get_words())  # use a set for fast membership checks

    emb = np.zeros(shape=(len(vocabulary) + 2, embedding_dim), dtype='float32')  # 0 for mask, 1 for unknown token
    emb[1] = np.random.normal(0, 0.05, embedding_dim)
    nb_unk = 0
    for w, i in vocabulary.items():
        if w not in model_vocab:
            nb_unk += 1
            emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
        else:
            emb[i, :] = model.get_word_vector(w)
    print('Logging Info - Fasttext Embedding matrix created: {}, unknown tokens: {}'.format(emb.shape, nb_unk))
    os.remove(corpus_file_path)
    return emb
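# Usage sketch for the three trainers above: each consumes a tokenized corpus and a token-to-index
# vocabulary produced by the loaders earlier in this gist, and returns an embedding matrix aligned
# with that vocabulary (row 0 = mask, row 1 = unknown token).
w2v_emb = train_w2v(char_corpus, char2idx, embedding_dim=100)
glove_emb = train_glove(char_corpus, char2idx, embedding_dim=100)
ft_emb = train_fasttext(char_corpus, char2idx, embedding_dim=100)
print(w2v_emb.shape, glove_emb.shape, ft_emb.shape)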
# Demo: mask country names in a line using a single-character -> full-name lookup.
line = '俄罗斯战胜'
country_lookup = {
    '俄': '俄罗斯',
    '罗': '罗马尼亚'
}
for i in range(len(line)):
    # index into the (possibly already masked) line, so characters covered by an earlier
    # match ('罗' inside '俄罗斯') are skipped
    c = line[i]
    if c in country_lookup:
        c_country = country_lookup[c]
        line = line.replace(c_country, '*' * len(c_country))
        print(c_country)
# encoding=utf8
'''
Put this in a sitecustomize.py under site-packages (Python 2 only; sys.setdefaultencoding was removed in Python 3).
'''
import sys
reload(sys)  # `reload` is a builtin in Python 2
sys.setdefaultencoding('utf8')
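# On Python 3 the default encoding is already UTF-8, so the hack above is unnecessary:
import sys
print(sys.getdefaultencoding())  # 'utf-8'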
'''
Detecting Chinese characters.
The Unicode range for Chinese characters is u4e00 to u9fa5.
'''
def is_chinese(uchar):
    """Check whether a unicode character is a Chinese character."""
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return True
    else:
        return False


'''
Detecting digits 0-9.
Digits occupy two separate Unicode ranges depending on width: half-width digits u0030 to u0039,
full-width digits uff10 to uff19.
'''
def is_number(uchar):
    """Check whether a unicode character is a half-width digit."""
    if uchar >= u'\u0030' and uchar <= u'\u0039':
        return True
    else:
        return False


def is_Qnumber(uchar):
    """Check whether a unicode character is a full-width digit."""
    if uchar >= u'\uff10' and uchar <= u'\uff19':
        return True
    else:
        return False


'''
Detecting upper/lower-case letters.
Letters occupy four Unicode ranges depending on case and width:
half-width upper-case u0041 - u005a, half-width lower-case u0061 - u007a;
full-width upper-case uff21 - uff3a, full-width lower-case uff41 - uff5a.
'''
def is_alphabet(uchar):
    """Check whether a unicode character is a half-width English letter."""
    if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'):
        return True
    else:
        return False


def is_Qalphabet(uchar):
    """Check whether a unicode character is a full-width English letter."""
    if (uchar >= u'\uff21' and uchar <= u'\uff3a') or (uchar >= u'\uff41' and uchar <= u'\uff5a'):
        return True
    else:
        return False


'''
Detecting everything else: characters other than Chinese characters, digits 0-9 and letters.
'''
def is_other(uchar):
    """Check whether a character is not a Chinese character, digit or English letter."""
    if not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)):
        return True
    else:
        return False
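# Quick sanity check of the predicates above:
print(is_chinese('中'), is_number('3'), is_Qnumber('3'), is_alphabet('a'), is_Qalphabet('A'), is_other('!'))
# True True True True True True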
'''
Full-width / half-width conversion.
The conversion relies on the digit/letter checks above.
1. Half-width to full-width: characters outside the half-width range are returned unchanged; the space
   is handled as a special case; for everything else the formula is: half-width = full-width - 0xfee0.
2. Full-width to half-width is the exact reverse: full-width = half-width + 0xfee0.
'''
def B2Q(uchar):
    """Convert a single character from half-width to full-width."""
    inside_code = ord(uchar)
    if inside_code < 0x0020 or inside_code > 0x7e:  # not a half-width character, return it unchanged
        return uchar
    if inside_code == 0x0020:  # the space is a special case; otherwise half-width = full-width - 0xfee0
        inside_code = 0x3000
    else:
        inside_code += 0xfee0
    return chr(inside_code)


def Q2B(uchar):
    """Convert a single character from full-width to half-width."""
    inside_code = ord(uchar)
    if inside_code == 0x3000:
        inside_code = 0x0020
    else:
        inside_code -= 0xfee0
    if inside_code < 0x0020 or inside_code > 0x7e:  # not a half-width character after conversion, return the original
        return uchar
    return chr(inside_code)


def stringQ2B(ustring):
    """Convert all full-width characters in a string to half-width."""
    return "".join([Q2B(uchar) for uchar in ustring])


def stringpartQ2B(ustring):
    """Convert only the full-width digits and letters in a string to half-width."""
    return "".join([Q2B(uchar) if is_Qnumber(uchar) or is_Qalphabet(uchar) else uchar for uchar in ustring])
if __name__ == '__main__':
    text = "电影《2012》讲述了2012年12月21日的世界末日,主人公Jack以及世界各国人民挣扎求生的经历,灾难面前,尽现人间百态。"
    print("Original text:", text, sep="\n", end="\n")
    text1 = stringQ2B(text)
    print("Full-width to half-width:", text1, sep="\n", end="\n")
    text2 = stringpartQ2B(text)
    print("Digits/letters full-width to half-width:", text2, sep="\n", end="\n")
from typing import List
import jieba_fast

tokens = {}


def preprocess(text: str) -> str:
    text = text.strip()
    return text


def tokenizer(texts, n_grams=[1, 2], deduplicate=True) -> List[str]:
    """Tokenize one string or a list of strings with jieba_fast plus character n-grams."""
    tokens = []
    if isinstance(texts, str):
        texts = [texts]
    for text in texts:
        tokens.append(text)  # keep the full text as a token as well
        text = preprocess(text)
        # cut words with jieba
        tokens += list(jieba_fast.cut_for_search(text))
        # cut words by character n-grams
        for n in range(min(n_grams), max(n_grams) + 1):
            assert n > 0
            tokens += [text[i:i + n] for i in range(len(text) - n + 1)]
    return list(set(tokens)) if deduplicate else tokens
if __name__ == '__main__':
    text = '中美'
    tokens = tokenizer(text, deduplicate=False)
    print(tokens)