@pazzo83 · Created August 28, 2021 19:46
A pure-Julia tokenizer, similar to Keras' `Tokenizer`, built on TextAnalysis.jl.
using TextAnalysis
# Keras-style tokenizer: builds a word => index vocabulary from a corpus of texts.
mutable struct Tokenizer
    max_vocab::Int                  # keep only the max_vocab most frequent words (0 = no limit)
    word_index::Dict{String, Int}   # word => integer index
    index_word::Dict{Int, String}   # integer index => word
    filters::Regex                  # characters stripped from texts before tokenizing
    lower::Bool                     # lowercase texts before tokenizing

    # The default filter set mirrors Keras' Tokenizer filters, written as a regex character
    # class so each punctuation character is matched (and removed) individually.
    Tokenizer(
        max_vocab::Int = 0,
        filters::Regex = r"[!\"#\$%&()*+,\-./:;<=>?@\[\\\]^_`{|}~\t\n]",
        lower::Bool = true
    ) = new(max_vocab, Dict{String, Int}(), Dict{Int, String}(), filters, lower)
end
# Build the vocabulary (word_index / index_word) from a vector of raw texts.
function fit_on_texts!(tokenizer::Tokenizer, texts::Vector{String})
    corpus = Corpus(StringDocument.(texts))
    if tokenizer.lower
        remove_case!(corpus)
    end
    remove_patterns!(corpus, tokenizer.filters)
    update_lexicon!(corpus)
    if tokenizer.max_vocab > 0
        # keep the max_vocab most frequent words, ordered by descending count
        vocab = getindex.(sort(collect(corpus.lexicon), by=x->x[2], rev=true), 1)[1:min(tokenizer.max_vocab, end)]
        tokenizer.word_index = TextAnalysis.columnindices(vocab)
    else
        # no limit: index the full lexicon in alphabetical order
        tokenizer.word_index = TextAnalysis.columnindices(sort(collect(keys(lexicon(corpus)))))
    end
    tokenizer.index_word = Dict(idx => word for (word, idx) = tokenizer.word_index)
    return tokenizer
end
# Convert each text to a vector of word indices; words not in the vocabulary map to 0.
function texts_to_sequences(tokenizer::Tokenizer, texts::Vector{String})
    corpus = Corpus(StringDocument.(texts))
    if tokenizer.lower
        remove_case!(corpus)
    end
    remove_patterns!(corpus, tokenizer.filters)
    sequences = Vector{Vector{Int}}(undef, length(corpus.documents))
    for i in eachindex(sequences)
        doc_tokens = tokens(corpus.documents[i])
        sequence = zeros(Int, length(doc_tokens))
        for k in eachindex(sequence)
            # 0 marks an out-of-vocabulary token
            sequence[k] = get(tokenizer.word_index, doc_tokens[k], 0)
        end
        sequences[i] = sequence
    end
    return sequences
end
# Map sequences of word indices back to vectors of tokens.
function sequences_to_texts(tokenizer::Tokenizer, sequences::Vector{Vector{Int}})
    text_sequences = Vector{Vector{String}}(undef, length(sequences))
    for i = eachindex(sequences)
        text_sequences[i] = [tokenizer.index_word[w] for w = sequences[i]]
    end
    return text_sequences
end