Pure Python char n-gram tokenizers: sequences and generators
#%%
from collections import Counter, OrderedDict
from itertools import zip_longest, tee
## n-gram iterators and tokenizers, working on lists or generators
def ngram_tokenizer_iter(iterable, n, fillvalue=''):
    "Splits text into non-overlapping n-character chunks, [abcd] -> ab, cd, for text passed as a generator/iterable."
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    zip_tuples = zip_longest(*args, fillvalue=fillvalue)
    for tup in zip_tuples:
        yield "".join(tup)
def ngram_tokenizer(ngrams):
    '''Splits text into non-overlapping n-character chunks: [abcd] -> ab, cd.
    '''
    def func(text):
        return (text[pos:pos + ngrams] for pos in range(0, len(text), ngrams))
    return func
def ngram_vocab_gen(ngrams):
    '''Generates all overlapping n-grams: [abcd] -> ab, bc, cd.
    '''
    def func(text):
        return (text[i:i+ngrams] for i in range(len(text)+1-ngrams))
    return func
# An iterator analogue of ngram_vocab_gen can be built as in:
## https://stackoverflow.com/questions/5434891/iterate-a-list-as-pair-current-next-in-python
def ngram_vocab_gen_iter(iterator, ngrams):
    "s -> (s0,s1,s2), (s1,s2,s3), (s2,s3,s4), ..."
    iter_tuple = tee(iterator, ngrams)
    list_of_iters = []
    # advance the i-th tee'd copy by i elements so zipping them yields sliding windows
    for i, one_iter in enumerate(iter_tuple):
        for _ in range(i):
            next(one_iter, "")
        list_of_iters.append(one_iter)
    for tup in zip(*list_of_iters):
        yield "".join(tup)
#%%
dataset_text = "aabbcc ddaaa aacca caca baaba baac " #dataset z znaków a,b,c aby było prościej :)
# dataset_text = "Twój długi tekst, najczęściej scalony cały dataset to jednego stringa, albo jego część, albo generator odczytujący z pliku linie po lini "
#%% All possible char-bi-grams
a=list(ngram_vocab_gen(2)(dataset_text))
print(f'{len(a)} {a[0:10]}')
#%%
a=list(ngram_vocab_gen_iter(dataset_text,2))
print(f'{len(a)} {a[0:10]}')
#%% bi-gram tokenizer
a=list(ngram_tokenizer_iter(dataset_text,2))
print(f'{len(a)} {a[0:10]}')
#%%
a=list(ngram_tokenizer(2)(dataset_text))
print(f'{len(a)} {a[0:10]}')
#%% testing speed and accuracy of n-gram tokenization and n-gram vocab generation
import timeit
import numpy as np
SETUP_CODE = '''
from __main__ import ngram_tokenizer_iter, ngram_tokenizer, ngram_vocab_gen, ngram_vocab_gen_iter
from __main__ import dataset_text
'''
CODE1='''
a=list(ngram_tokenizer_iter(dataset_text,2))
'''
CODE2='''
a=list(ngram_tokenizer(2)(dataset_text))
'''
CODE3='''
a=list(ngram_vocab_gen(2)(dataset_text))
'''
CODE4='''
a=list(ngram_vocab_gen_iter(dataset_text,2))
'''
print(f'{CODE1} time={np.mean(timeit.repeat(CODE1,SETUP_CODE, repeat=3,number=10))}')
print(f'{CODE2} time={np.mean(timeit.repeat(CODE2,SETUP_CODE, repeat=3,number=10))}')
print(f'{CODE3} time={np.mean(timeit.repeat(CODE3,SETUP_CODE, repeat=3,number=10))}')
print(f'{CODE4} time={np.mean(timeit.repeat(CODE4,SETUP_CODE, repeat=3,number=10))}')
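#%% Accuracy check (added; not in the original gist): the benchmark header above mentions
# accuracy, so verify that each iterator variant matches its list/slice-based counterpart.
assert list(ngram_tokenizer_iter(dataset_text, 2)) == list(ngram_tokenizer(2)(dataset_text))
assert list(ngram_vocab_gen_iter(dataset_text, 2)) == list(ngram_vocab_gen(2)(dataset_text))
print("iterator and sequence variants produce identical tokens")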
# %%