Created
June 26, 2020 06:58
-
-
Save tejasvaidhyadev/21a092ff3fe1f2c146a60af44b9519c1 to your computer and use it in GitHub Desktop.
Experimental implementation of SentencePiece in Julia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
    load(path, vocabsize=30000)

Read a SentencePiece vocabulary file at `path` (one `token<TAB>logprob` per
line) and build a `Sentencepiecemodel` from its first `vocabsize` entries.
The SentencePiece word-boundary meta symbol `▁` is normalised to `_` in
every token.

Throws a `BoundsError` if the file has fewer than `vocabsize` lines.
"""
function load(path, vocabsize=30000)
    # Each line is "token\tlogprob".
    entries = split.(readlines(path), "\t")
    vocab = String[replace(String(e[1]), "▁" => "_") for e in entries[1:vocabsize]]
    logprob = Float64[parse(Float64, e[2]) for e in entries[1:vocabsize]]
    return Sentencepiecemodel(vocab, logprob)
end
"""
    getindex(sp::Sentencepiecemodel, text)

Return the vocabulary index of the token `text` in `sp.vocab`.
Throws `KeyError` if the token is not in the vocabulary.
"""
function getindex(sp::Sentencepiecemodel, text)
    # findfirst stops at the first hit instead of materialising every match
    # the way findall(...)[1] did.
    idx = findfirst(isequal(text), sp.vocab)
    # NOTE(review): the original raised BoundsError on a missing token;
    # KeyError names the failure more precisely.
    idx === nothing && throw(KeyError(text))
    return idx
end
""" | |
struct Nodes | |
text::String | |
score::Float32 | |
index::Int64 | |
start::Int | |
en::Int | |
end | |
Utility structure, To hold the results of the forward pass (the forward Viterbi lattice) | |
hold the token token string, score, vocabulary index, start and end character position | |
""" | |
struct Nodes | |
text::String | |
score::Float32 | |
index::Int64 | |
start::Int | |
en::Int | |
end | |
""" | |
decode_forward(sp::Sentencepiecemodel,text::String) | |
Return all possible ngrams generated from sequence of items, as an Array{String,1} | |
# Example | |
```julia-repl | |
julia> seq = ["To","be","or","not"] | |
julia> a = everygram(seq,min_len=1, max_len=-1) | |
10-element Array{Any,1}: | |
"or" | |
"not" | |
"To" | |
"be" | |
"or not" | |
"be or" | |
"be or not" | |
"To be or" | |
"To be or not" | |
``` | |
""" | |
function decode_forward(sp::Sentencepiecemodel,text::String) | |
results = Array{Nodes,1}(undef,length(text)) | |
scores = fill(-Inf ,length(text)) | |
scores[1] =0 | |
for char_end in 1:length(text) | |
for char_start in 1:char_end | |
if text[char_start:char_end] in sp.vocab | |
subtokenid = getindex(sp,text[char_start:char_end])[1] | |
local_score = scores[char_start]+ sp.logprob[subtokenid] | |
if local_score > scores[char_end] | |
results[char_end]=Nodes(text[char_start:char_end],local_score,subtokenid,char_start,char_end) | |
scores[char_end]=local_score | |
end | |
end | |
end | |
if scores[char_end] == -Inf | |
results[char_end] = Nodes(text[char_end-1:char_end],-Inf,1,char_end-1,char_end) | |
scores[char_end] =0 | |
end | |
if scores[char_end] == 0 | |
results[char_end] = Nodes(text[char_end:char_end],-Inf,1,char_end,char_end) | |
end | |
end | |
return(results) | |
end | |
""" | |
decode_forward(sp::Sentencepiecemodel,text::String) | |
Return all possible ngrams generated from sequence of items, as an Array{String,1} | |
""" | |
function Decode_backward1(sp::Sentencepiecemodel,nodes) | |
next_nodes=nodes[end] | |
best_seq =[] | |
while next_nodes.start > 1 | |
node_value = next_nodes | |
next_nodes = nodes[(node_value.start)-1] | |
push!(best_seq,node_value) | |
end | |
push!(best_seq,next_nodes) | |
return(best_seq) | |
end | |
""" | |
Tokenizer(sp::Sentencepiecemodel,text) | |
given spm path and text it tokenized you string | |
It does all the preprocessing step needed | |
""" | |
function Tokenizer(sp::Sentencepiecemodel,text) | |
tks=[] | |
text = replace(text," " => "_") | |
if text[1] != '_' | |
text = "_"*text | |
end | |
output = decode_forward(sp,text) | |
tokens = Decode_backward1(sp,output) | |
tokens = reverse(tokens) | |
for node in tokens | |
push!(tks,node.text) | |
end | |
tks = string.(tks) | |
return(tks) | |
end | |
""" | |
ids_from_tokens(tk::Array{String,1}) | |
given tokens it provide its indices | |
""" | |
function ids_from_tokens(tk) | |
idlist=[] | |
for i in tk | |
idx = getindex(spm,i) | |
push!(idlist,idx) | |
end | |
return convert.(Int,idlist) | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
    load(path, vocabsize=30000)

Read a SentencePiece vocabulary file at `path` (one `token<TAB>logprob` per
line) and build a `Sentencepiecemodel` from its first `vocabsize` entries.
The SentencePiece word-boundary meta symbol `▁` is normalised to `_` in
every token.

Throws a `BoundsError` if the file has fewer than `vocabsize` lines.
"""
function load(path, vocabsize=30000)
    # Each line is "token\tlogprob".
    entries = split.(readlines(path), "\t")
    vocab = String[replace(String(e[1]), "▁" => "_") for e in entries[1:vocabsize]]
    logprob = Float64[parse(Float64, e[2]) for e in entries[1:vocabsize]]
    return Sentencepiecemodel(vocab, logprob)
end
"""
    getindex(sp::Sentencepiecemodel, text)

Return the vocabulary index of the token `text` in `sp.vocab`.
Throws `KeyError` if the token is not in the vocabulary.
"""
function getindex(sp::Sentencepiecemodel, text)
    # findfirst stops at the first hit instead of materialising every match
    # the way findall(...)[1] did.
    idx = findfirst(isequal(text), sp.vocab)
    # NOTE(review): the original raised BoundsError on a missing token;
    # KeyError names the failure more precisely.
    idx === nothing && throw(KeyError(text))
    return idx
end
""" | |
struct Nodes | |
text::String | |
score::Float32 | |
index::Int64 | |
start::Int | |
en::Int | |
end | |
Utility structure, To hold the results of the forward pass (the forward Viterbi lattice) | |
hold the token token string, score, vocabulary index, start and end character position | |
""" | |
struct Nodes | |
text::String | |
score::Float32 | |
index::Int64 | |
start::Int | |
en::Int | |
end | |
""" | |
decode_forward(sp::Sentencepiecemodel,text::String) | |
Return all possible ngrams generated from sequence of items, as an Array{String,1} | |
# Example | |
```julia-repl | |
julia> seq = ["To","be","or","not"] | |
julia> a = everygram(seq,min_len=1, max_len=-1) | |
10-element Array{Any,1}: | |
"or" | |
"not" | |
"To" | |
"be" | |
"or not" | |
"be or" | |
"be or not" | |
"To be or" | |
"To be or not" | |
``` | |
""" | |
function decode_forward(sp::Sentencepiecemodel,text::String) | |
results = Array{Nodes,1}(undef,length(text)) | |
scores = fill(-Inf ,length(text)) | |
scores[1] =0 | |
for char_end in 1:length(text) | |
for char_start in 1:char_end | |
if text[char_start:char_end] in sp.vocab | |
subtokenid = getindex(sp,text[char_start:char_end])[1] | |
local_score = scores[char_start]+ sp.logprob[subtokenid] | |
if local_score > scores[char_end] | |
results[char_end]=Nodes(text[char_start:char_end],local_score,subtokenid,char_start,char_end) | |
scores[char_end]=local_score | |
end | |
end | |
end | |
if scores[char_end] == -Inf | |
results[char_end] = Nodes(text[char_end-1:char_end],-Inf,1,char_end-1,char_end) | |
scores[char_end] =0 | |
end | |
if scores[char_end] == 0 | |
results[char_end] = Nodes(text[char_end:char_end],-Inf,1,char_end,char_end) | |
end | |
end | |
return(results) | |
end | |
""" | |
decode_forward(sp::Sentencepiecemodel,text::String) | |
Return all possible ngrams generated from sequence of items, as an Array{String,1} | |
""" | |
function Decode_backward1(sp::Sentencepiecemodel,nodes) | |
next_nodes=nodes[end] | |
best_seq =[] | |
while next_nodes.start > 1 | |
node_value = next_nodes | |
next_nodes = nodes[(node_value.start)-1] | |
push!(best_seq,node_value) | |
end | |
push!(best_seq,next_nodes) | |
return(best_seq) | |
end | |
""" | |
Tokenizer(sp::Sentencepiecemodel,text) | |
given spm path and text it tokenized you string | |
It does all the preprocessing step needed | |
""" | |
function Tokenizer(sp::Sentencepiecemodel,text) | |
tks=[] | |
text = replace(text," " => "_") | |
if text[1] != '_' | |
text = "_"*text | |
end | |
output = decode_forward(sp,text) | |
tokens = Decode_backward1(sp,output) | |
tokens = reverse(tokens) | |
for node in tokens | |
push!(tks,node.text) | |
end | |
tks = string.(tks) | |
return(tks) | |
end | |
""" | |
ids_from_tokens(tk::Array{String,1}) | |
given tokens it provide its indices | |
""" | |
function ids_from_tokens(tk) | |
idlist=[] | |
for i in tk | |
idx = getindex(spm,i) | |
push!(idlist,idx) | |
end | |
return convert.(Int,idlist) | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment