@tejasvaidhyadev
Created June 26, 2020 06:58
Experimental implementation of SentencePiece in Julia
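# The functions below reference a `Sentencepiecemodel` type that is not defined
# in this gist. A minimal sketch, assuming only the two fields the code actually
# uses (`vocab` and `logprob`); the field types are inferred from `load` below
# and this definition is not part of the original:
struct Sentencepiecemodel
    vocab::Array{String,1}     # subword pieces, with "▁" already mapped to "_"
    logprob::Array{Float64,1}  # unigram log-probabilities, aligned with `vocab`
end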
"""
    load(path)

Load a SentencePiece unigram vocabulary file (one `piece<TAB>logprob` line per
entry) and return a `Sentencepiecemodel`. The SentencePiece meta symbol "▁" is
mapped to "_", matching the convention used by `Tokenizer` below.
"""
function load(path)
    entries = split.(readlines(path), "\t")   # each entry: [piece, logprob]
    vocab = [replace(entry[1], "▁" => "_") for entry in entries]
    logprob = [parse(Float64, entry[2]) for entry in entries]
    return Sentencepiecemodel(vocab, logprob)
end
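# A sketch of how `load` might be used; "albert_vocab.tsv" is a hypothetical
# path to a tab-separated `piece<TAB>logprob` vocabulary file, not a file that
# ships with this gist:
#   spm = load("albert_vocab.tsv")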
# Look up the vocabulary index of a given piece. `Base.getindex` is imported
# explicitly so this method extends indexing rather than shadowing it.
import Base: getindex
function getindex(sp::Sentencepiecemodel, text)
    return findfirst(isequal(text), sp.vocab)
end
"""
struct Nodes
text::String
score::Float32
index::Int64
start::Int
en::Int
end
Utility structure, To hold the results of the forward pass (the forward Viterbi lattice)
hold the token token string, score, vocabulary index, start and end character position
"""
struct Nodes
    text::String
    score::Float32
    index::Int64
    start::Int
    en::Int
end
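# Illustrative lattice entry (the score and vocabulary index are made up, not
# taken from a real model): `Nodes("_he", -5.2f0, 151, 1, 3)` records that the
# piece "_he" covers characters 1 through 3 of the input.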
"""
decode_forward(sp::Sentencepiecemodel,text::String)
Return all possible ngrams generated from sequence of items, as an Array{String,1}
# Example
```julia-repl
julia> seq = ["To","be","or","not"]
julia> a = everygram(seq,min_len=1, max_len=-1)
10-element Array{Any,1}:
"or"
"not"
"To"
"be"
"or not"
"be or"
"be or not"
"To be or"
"To be or not"
```
"""
function decode_forward(sp::Sentencepiecemodel, text::String)
    results = Array{Nodes,1}(undef, length(text))
    scores = fill(-Inf, length(text))
    scores[1] = 0
    for char_end in 1:length(text)
        # Try every piece that ends at `char_end` and keep the best-scoring one.
        for char_start in 1:char_end
            if text[char_start:char_end] in sp.vocab
                subtokenid = getindex(sp, text[char_start:char_end])[1]
                local_score = scores[char_start] + sp.logprob[subtokenid]
                if local_score > scores[char_end]
                    results[char_end] = Nodes(text[char_start:char_end], local_score, subtokenid, char_start, char_end)
                    scores[char_end] = local_score
                end
            end
        end
        # Fallback when no vocabulary piece reaches this position: insert a
        # placeholder node with score -Inf so the backward pass stays connected.
        if scores[char_end] == -Inf
            results[char_end] = Nodes(text[char_end-1:char_end], -Inf, 1, char_end - 1, char_end)
            scores[char_end] = 0
        end
        if scores[char_end] == 0
            results[char_end] = Nodes(text[char_end:char_end], -Inf, 1, char_end, char_end)
        end
    end
    return results
end
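# A sketch of the forward pass on a toy input (assuming `spm` was built with
# `load` above; the exact pieces chosen depend on the vocabulary):
#   lattice = decode_forward(spm, "_hello")
#   length(lattice) == length("_hello")   # one best-ending Node per character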
"""
decode_forward(sp::Sentencepiecemodel,text::String)
Return all possible ngrams generated from sequence of items, as an Array{String,1}
"""
function Decode_backward1(sp::Sentencepiecemodel, nodes)
    next_nodes = nodes[end]
    best_seq = Nodes[]
    while next_nodes.start > 1
        node_value = next_nodes
        next_nodes = nodes[node_value.start - 1]
        push!(best_seq, node_value)
    end
    push!(best_seq, next_nodes)
    return best_seq
end
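# A sketch of recovering the best segmentation from the lattice (hypothetical
# `spm` and `lattice` from the examples above); the result is last-token-first,
# which is why `Tokenizer` reverses it:
#   best = Decode_backward1(spm, lattice)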
"""
Tokenizer(sp::Sentencepiecemodel,text)
given spm path and text it tokenized you string
It does all the preprocessing step needed
"""
function Tokenizer(sp::Sentencepiecemodel, text)
    # SentencePiece preprocessing: spaces become "_" and the text is given a
    # leading "_" if it does not already start with one.
    text = replace(text, " " => "_")
    if text[1] != '_'
        text = "_" * text
    end
    lattice = decode_forward(sp, text)
    tokens = reverse(Decode_backward1(sp, lattice))
    return [node.text for node in tokens]
end
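# A sketch of end-to-end tokenization (assuming `spm` was loaded as above; the
# pieces shown are illustrative and depend on the actual vocabulary):
#   Tokenizer(spm, "hello world")
#   # e.g. ["_hello", "_world"] for a vocabulary containing those pieces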
"""
ids_from_tokens(tk::Array{String,1})
given tokens it provide its indices
"""
function ids_from_tokens(sp::Sentencepiecemodel, tk)
    # Look up each token's vocabulary index with the `getindex` method above.
    return [getindex(sp, token) for token in tk]
end
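# A sketch of mapping tokens back to vocabulary ids (hypothetical `spm` as
# above):
#   ids_from_tokens(spm, Tokenizer(spm, "hello world"))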