Pure Python char n-gram tokenizers: sequences and generators
#%%
from collections import Counter, OrderedDict
from itertools import zip_longest, tee
## n-gram iterators and tokenizers, working on lists or generators
def ngram_tokenizer_iter(iterable, n, fillvalue=''):
    "generates pairs of adjacent characters, tokenizes [abcd]->ab, cd for text passed in as a generator"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    zip_tuples = zip_longest(*args, fillvalue=fillvalue)
    for tup in zip_tuples:
        yield "".join(tup)
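# worked example (added for clarity): the grouper pads the final chunk with
# fillvalue, so list(ngram_tokenizer_iter("abcde", 2)) == ['ab', 'cd', 'e']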
def ngram_tokenizer(ngrams):
    '''generates pairs of adjacent characters, tokenizes [abcd]->ab, cd
    '''
    def func(text):
        return (text[pos:pos + ngrams] for pos in range(0, len(text), ngrams))
    return func
def ngram_vocab_gen(ngrams):
    '''generates all ngrams with a sliding window, [abcd]->ab, bc, cd
    '''
    def func(text):
        return (text[i:i+ngrams] for i in range(len(text)+1-ngrams))
    return func
# an iterator analogue of ngram_vocab_gen can be built following
## https://stackoverflow.com/questions/5434891/iterate-a-list-as-pair-current-next-in-python
def ngram_vocab_gen_iter(iterator, ngrams):
    "s -> (s0,s1,s2), (s1,s2,s3), (s2,s3,s4), ..."
    iter_tuple = tee(iterator, ngrams)
    list_of_iters = []
    for i, one_iter in enumerate(iter_tuple):
        # advance the i-th tee copy by i positions so the copies are staggered
        for _ in range(i):
            next(one_iter, "")
        list_of_iters.append(one_iter)
    # zip stops at the shortest (most advanced) copy, so only full ngrams are yielded
    for tup in zip(*list_of_iters):
        yield "".join(tup)
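#%% a minimal sketch (not part of the original gist): the Counter/OrderedDict
# imports above can turn these generators into a frequency-ordered vocabulary;
# build_vocab and the min_freq cutoff are illustrative assumptions
def build_vocab(tokens, min_freq=1):
    counts = Counter(tokens)
    # most_common() sorts descending by count; keep ngrams seen at least min_freq times
    return OrderedDict((tok, c) for tok, c in counts.most_common() if c >= min_freq)
# vocab = build_vocab(ngram_vocab_gen(2)("aabbcc"))  # {'aa': 1, 'ab': 1, 'bb': 1, 'bc': 1, 'cc': 1}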
#%%
dataset_text = "aabbcc ddaaa aacca caca baaba baac " #dataset z znaków a,b,c aby było prościej :) | |
# dataset_text = "Twój długi tekst, najczęściej scalony cały dataset to jednego stringa, albo jego część, albo generator odczytujący z pliku linie po lini " | |
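#%% a minimal sketch (not in the original gist) of such a generator: stream
# characters from a file line by line, so the *_iter variants can tokenize
# without loading the whole text into memory; "corpus.txt" is a hypothetical path
def chars_from_file(path):
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            yield from line
# tokens = ngram_tokenizer_iter(chars_from_file("corpus.txt"), 2)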
#%% All possible char bigrams
a = list(ngram_vocab_gen(2)(dataset_text))
print(f'{len(a)} {a[0:10]}')
#%%
a = list(ngram_vocab_gen_iter(dataset_text, 2))
print(f'{len(a)} {a[0:10]}')
#%% bigram tokenizer
a = list(ngram_tokenizer_iter(dataset_text, 2))
print(f'{len(a)} {a[0:10]}')
#%%
a = list(ngram_tokenizer(2)(dataset_text))
print(f'{len(a)} {a[0:10]}')
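#%% sanity check (added, not in the original gist): the iterator-based and
# slicing-based variants should produce identical tokens on string input
assert list(ngram_tokenizer_iter(dataset_text, 2)) == list(ngram_tokenizer(2)(dataset_text))
assert list(ngram_vocab_gen_iter(dataset_text, 2)) == list(ngram_vocab_gen(2)(dataset_text))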
#%% timing the ngram tokenization and ngram vocab generation variants
import timeit
import numpy as np
SETUP_CODE = '''
from __main__ import ngram_tokenizer_iter, ngram_tokenizer, ngram_vocab_gen, ngram_vocab_gen_iter
from __main__ import dataset_text
'''
CODE1 = '''
a = list(ngram_tokenizer_iter(dataset_text, 2))
'''
CODE2 = '''
a = list(ngram_tokenizer(2)(dataset_text))
'''
CODE3 = '''
a = list(ngram_vocab_gen(2)(dataset_text))
'''
CODE4 = '''
a = list(ngram_vocab_gen_iter(dataset_text, 2))
'''
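# note (added): the timeit docs recommend looking at the minimum over the repeats
# rather than the mean, since higher values usually reflect interference from
# other processes rather than the code under test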
print(f'{CODE1} time={np.mean(timeit.repeat(CODE1, SETUP_CODE, repeat=3, number=10))}')
print(f'{CODE2} time={np.mean(timeit.repeat(CODE2, SETUP_CODE, repeat=3, number=10))}')
print(f'{CODE3} time={np.mean(timeit.repeat(CODE3, SETUP_CODE, repeat=3, number=10))}')
print(f'{CODE4} time={np.mean(timeit.repeat(CODE4, SETUP_CODE, repeat=3, number=10))}')
# %%