Skip to content

Instantly share code, notes, and snippets.

@IshitaTakeshi
Last active August 29, 2015 14:27
Show Gist options
  • Save IshitaTakeshi/dc09f91f9d6637f65ace to your computer and use it in GitHub Desktop.
Save IshitaTakeshi/dc09f91f9d6637f65ace to your computer and use it in GitHub Desktop.
svmlight / liblinear format file loader
# svmlight / liblinear format file loader
# The MIT License (MIT)
#
# Copyright (c) 2015 Ishita Takeshi
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
"""
    strip_line(line)

Return `line` with leading and trailing whitespace removed and every internal
run of whitespace collapsed to a single space.
"""
# `split` without a delimiter splits on whitespace and drops empty fields, so
# re-joining with one space normalizes the line without the three-argument
# `replace(s, regex, str)` method that no longer exists in Julia 1.x.
strip_line(line) = join(split(line), " ")
"""
    split_line(line)

Split `line` at every single space character and return the pieces.
Empty fields between consecutive spaces are kept.
"""
function split_line(line)
    return split(line, ' ')
end
"""
    ishash(s)

Return `true` when `s` is exactly the one-character string `"#"`.
"""
function ishash(s)
    return "#" == s
end
using SparseArrays: sparsevec

"""
    line_to_sparse_vector(line)

Parse one svmlight / liblinear record of the form

    label index:value index:value ... # optional comment

and return the tuple `(vector, label)`.

# Returns
- `vector`: a sparse vector built from the `index => value` pairs of the line.
  Its length is the largest index that appears on the line (0 when the line
  has no features).
- `label`: the first token of the line parsed as `Float64`.

# Throws
- `ArgumentError` if the line contains no label, or if a label, index or value
  cannot be parsed as a number.
"""
function line_to_sparse_vector(line)
    # Everything from the first `#` onwards is a comment, whether or not the
    # hash is separated from the preceding or following text by whitespace.
    record = first(split(line, '#'; limit=2))

    # Splitting without a delimiter trims the line and treats any run of
    # whitespace (spaces, tabs, the trailing newline) as one separator.
    tokens = split(record)
    isempty(tokens) &&
        throw(ArgumentError("no label found in line: $(repr(line))"))

    label = parse(Float64, tokens[1])

    features = Dict{Int64,Float64}()
    for token in @view tokens[2:end]
        pair = split(token, ':')
        features[parse(Int64, pair[1])] = parse(Float64, pair[2])
    end

    return sparsevec(features), label
end
"""
    starts_with_comment(line)

Return `true` when the first non-whitespace character of `line` is `#`,
i.e. when the whole line is a comment. Blank lines return `false`.
"""
function starts_with_comment(line)
    # Checking the first character (rather than the first space-separated
    # token) also recognizes comments written without a space after the hash,
    # such as "#comment".
    return startswith(lstrip(line), "#")
end
using SparseArrays: SparseVector

"""
    load_svmlight_file(filename, ElementType=Float64)

Read a file in svmlight / liblinear format and return the tuple `(X, y)`.

# Arguments
- `filename`: path of the file to read.
- `ElementType`: type the feature values are converted to.

# Returns
- `X`: a vector with one sparse feature vector per data line, in file order.
  Each vector is produced by `line_to_sparse_vector`, so vectors coming from
  different lines may have different lengths.
- `y`: a `Vector{Int64}` holding the label of each data line.

# Throws
- `InexactError` if a label is not integer-valued (labels are stored as
  `Int64`) or if a feature value cannot be represented as `ElementType`.

Blank lines and lines for which `starts_with_comment` returns `true` are
skipped.
"""
function load_svmlight_file(filename, ElementType=Float64)
    RowType = SparseVector{ElementType,Int64}
    X = RowType[]
    y = Int64[]
    open(filename) do file
        for line in eachline(file)
            # A blank line carries no label, so parsing it would fail.
            if isempty(strip(line)) || starts_with_comment(line)
                continue
            end
            vector, label = line_to_sparse_vector(line)
            push!(X, convert(RowType, vector))
            push!(y, label)
        end
    end
    return X, y
end
# Example usage (pass the path of an svmlight file as the first argument):
#   X, y = load_svmlight_file(ARGS[1])
#   println(X)
#   println(y)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment