@pazzo83 · Created August 28, 2021 19:46
A pure-Julia tokenizer, similar to Keras' `Tokenizer`, built on TextAnalysis.jl.
using TextAnalysis
# Keras-style tokenizer: builds a word => index vocabulary from a corpus of texts.
mutable struct Tokenizer
    max_vocab::Int                  # keep only the max_vocab most frequent words (0 = no limit)
    word_index::Dict{String, Int}   # word => integer index
    index_word::Dict{Int, String}   # integer index => word
    filters::Regex                  # characters stripped from texts before tokenizing
    lower::Bool                     # lowercase texts before tokenizing

    # The default filter set mirrors Keras' Tokenizer filters, written as a regex character
    # class so each punctuation character is matched (and removed) individually.
    Tokenizer(
        max_vocab::Int = 0,
        filters::Regex = r"[!\"#\$%&()*+,\-./:;<=>?@\[\\\]^_`{|}~\t\n]",
        lower::Bool = true
    ) = new(max_vocab, Dict{String, Int}(), Dict{Int, String}(), filters, lower)
end
# Build the vocabulary (word_index / index_word) from a vector of raw texts.
function fit_on_texts!(tokenizer::Tokenizer, texts::Vector{String})
    corpus = Corpus(StringDocument.(texts))
    if tokenizer.lower
        remove_case!(corpus)
    end
    remove_patterns!(corpus, tokenizer.filters)
    update_lexicon!(corpus)
    if tokenizer.max_vocab > 0
        # keep the max_vocab most frequent words, ordered by descending count
        vocab = getindex.(sort(collect(corpus.lexicon), by=x->x[2], rev=true), 1)[1:min(tokenizer.max_vocab, end)]
        tokenizer.word_index = TextAnalysis.columnindices(vocab)
    else
        # no limit: index the full lexicon in alphabetical order
        tokenizer.word_index = TextAnalysis.columnindices(sort(collect(keys(lexicon(corpus)))))
    end
    tokenizer.index_word = Dict(idx => word for (word, idx) = tokenizer.word_index)
    return tokenizer
end
# Convert each text to a vector of word indices; words not in the vocabulary map to 0.
function texts_to_sequences(tokenizer::Tokenizer, texts::Vector{String})
    corpus = Corpus(StringDocument.(texts))
    if tokenizer.lower
        remove_case!(corpus)
    end
    remove_patterns!(corpus, tokenizer.filters)
    sequences = Vector{Vector{Int}}(undef, length(corpus.documents))
    for i in eachindex(sequences)
        doc_tokens = tokens(corpus.documents[i])
        sequence = zeros(Int, length(doc_tokens))
        for k in eachindex(sequence)
            # 0 marks an out-of-vocabulary token
            sequence[k] = get(tokenizer.word_index, doc_tokens[k], 0)
        end
        sequences[i] = sequence
    end
    return sequences
end
# Map sequences of word indices back to vectors of tokens.
function sequences_to_texts(tokenizer::Tokenizer, sequences::Vector{Vector{Int}})
    text_sequences = Vector{Vector{String}}(undef, length(sequences))
    for i = eachindex(sequences)
        text_sequences[i] = [tokenizer.index_word[w] for w = sequences[i]]
    end
    return text_sequences
end