Skip to content

Instantly share code, notes, and snippets.

@pazzo83
pazzo83 / tokenizer.jl
Created August 28, 2021 19:46
Pure Julia tokenizer, similar to the Keras Tokenizer
using TextAnalysis
mutable struct Tokenizer
max_vocab::Int
word_index::Dict{String, Int}
index_word::Dict{Int, String}
filters::Regex
lower::Bool
Tokenizer(
using MLJModelInterface, MLJBase, TSVD
# MLJ model type for a TSVD-based transformer, declared as an `Unsupervised` model.
# The struct definition is handed to `@mlj_model`, which receives the per-field
# defaults written below.
# NOTE(review): the fit/transform methods are not visible in this excerpt, so how
# these fields are consumed should be confirmed against them.
MLJModelInterface.@mlj_model mutable struct TSVDTransformer <: MLJModelInterface.Unsupervised
# Defaults to 2. Presumably the number of singular values to compute — TODO confirm.
nvals::Int = 2
# Defaults to 1000. Presumably the solver's iteration cap — TODO confirm.
maxiter::Int = 1000
end
struct TSVDTransformerResult
singular_values::Vector{Float64}
components::Matrix{Float64}
@pazzo83
pazzo83 / tfidf_transformer_mlj.jl
Last active July 21, 2021 04:55
TFIDF transformer for MLJ
using MLJModelInterface, MLJBase, TextAnalysis, SparseArrays
# MLJ model type for the TF-IDF transformer, declared as an `Unsupervised` model.
# The struct has no fields, so no hyperparameters are declared here.
MLJModelInterface.@mlj_model mutable struct TfidfTransformer <: MLJModelInterface.Unsupervised
end
"""
    TfidfTransformerResult(vocab, idf_vector)

Immutable container holding a vector of strings (`vocab`) together with a vector
of `Float64` values (`idf_vector`).

NOTE(review): presumably this is the fitted state produced for `TfidfTransformer`,
with `idf_vector[i]` being the inverse-document-frequency weight of `vocab[i]`.
The code that builds it is not visible here, so confirm the pairing and the
expected equal lengths against the fit method.
"""
struct TfidfTransformerResult
    vocab::Vector{String}
    idf_vector::Vector{Float64}
end