Instantly share code, notes, and snippets.

# yegortk/L1Convex.jl

Last active January 5, 2024 02:05
Show Gist options
• Save yegortk/ce18975200e7dffd1759125972cd54f4 to your computer and use it in GitHub Desktop.
Julia implementation of OWL-QN algorithm for large-scale L1-regularized sparse convex optimization
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 module L1Convex # This is a Julia module implementing OWL-QN algorithm for large-scale L1-regularized convex optimization # https://www.microsoft.com/en-us/research/wp-content/uploads/2007/01/andrew07scalable.pdf # http://proceedings.mlr.press/v37/gonga15.pdf # The algorithm minimizes loss of the form: f(x) + lambda * ||x||_1 # where f(x) is a differentiable convex function # ||x||_1 is the L1 norm of the parameter vector x # lambda is the regularization strength # L-BFGS component implementation is based on: # https://en.wikipedia.org/wiki/Limited-memory_BFGS # https://mitpress.mit.edu/books/algorithms-optimization # https://bitbucket.org/rtaylor/pylbfgs/src/master/ # Author: Yegor Tkachenko (2022) export OWLQN, step! # libary to support ⋅ vector dot product notation using LinearAlgebra # struct with key algorithm parameters mutable struct OWLQN s # param_t+1 - param_t [max size of m] y # grad_t+1 - grad_t [max size of m] rho # 1/s]i]'y[i] m::Int # L-BFGS history length t::Int # iteration lambda::Float64 # L1 penalty end # constructor function OWLQN(x::Vector{Float64}; lambda = 1.0) s = [] y = [] rho = [] m = 6 t = 0 OWLQN(s, y, rho, m, t, lambda) end # projected gradient based on raw gradient, parameter values, and L1 reg. strength function pseudo_gradient(g::Vector{Float64}, x::Vector{Float64}, lambda::Float64) pg = zeros(size(g)) for i in 1:size(g)[1] if x[i] > 0 pg[i] = g[i] + lambda elseif x[i] < 0 pg[i] = g[i] - lambda else if g[i] + lambda < 0 pg[i] = g[i] + lambda elseif g[i] - lambda > 0 pg[i] = g[i] - lambda end end end return pg end # pi alignment operator - projection of a on orthat defined by b function project!(a::Vector{Float64}, b::Vector{Float64}) for i in 1:size(a)[1] if sign(a[i]) != sign(b[i]) a[i] = 0.0 end end end # projected backtracking line search function projected_backtracking_line_search_update(f::Function, pg::Vector{Float64}, x::Vector{Float64}, z::Vector{Float64}, lambda::Float64; alpha=1.0, beta=0.5, gamma=1e-4) y = f(x) + lambda*sum(abs.(x)) # choose orthant for the new point xi = sign.(x) for i in 1:size(xi)[1] if xi[i] == 0 xi[i] = sign(-pg[i]) end end while true # update current point xt = x - alpha * z # project point onto orthant project!(xt, xi) # sufficient decrease condition if f(xt) + lambda*sum(abs.(xt)) <= y + gamma * (pg ⋅ (xt - x)) x .= xt break end # update step size alpha *= beta end end # single update of parameters x function step!(M::OWLQN, f::Function, ∇f::Function, x::Vector{Float64}) s, y, rho, lambda, g = M.s, M.y, M.rho, M.lambda, ∇f(x) if all(g .== 0.0) println("(Local) minimum found: ∇f(x) == 0") return end m = min(M.m, size(s)[1]) M.t += 1 x_copy = deepcopy(x) g_copy = deepcopy(g) pg = pseudo_gradient(g, x, lambda) Q = deepcopy(pg) if m > 0 # L-BFGS computation of Hessian-scaled gradient z = H_inv * g alpha = [] for i in m : -1 : 1 push!(alpha, rho[i] * (s[i] ⋅ Q)) Q -= alpha[end] * y[i] end reverse!(alpha) z = Q .* (s[end] ⋅ y[end]) / (y[end] ⋅ y[end]) for i in 1 : m z += s[i] * (alpha[i] - rho[i] * (y[i] ⋅ z)) end # zeroing out all elements in z if sign(z[i]) != sign(g[i]) # that is, if scaling changes gradient sign project!(z, pg) # fancy way to do x .-= z projected_backtracking_line_search_update(f, pg, x, z, lambda) else # fancy way to do x .-= g projected_backtracking_line_search_update(f, pg, x, pg, lambda, alpha = 1 / sqrt(pg ⋅ pg)) end push!(s, x - x_copy) push!(y, ∇f(x) - g_copy) push!(rho, 1/(y[end] ⋅ s[end])) while length(s) > M.m popfirst!(s) popfirst!(y) popfirst!(rho) end end end

### yegortk commented May 30, 2022

To use `L1Convex` module

```using HTTP: request

"https://gist.githubusercontent.com/yegortk/ce18975200e7dffd1759125972cd54f4/raw/4d4d4543c5b3266a891a67c780f3f2687c89a579/L1Convex.jl" |>
url -> request("GET", url) |>
res -> String(res.body) |>
str -> include_string(Main, str)

using .L1Convex```

### yegortk commented May 30, 2022 • edited

```### EXAMPLE ###

using .L1Convex

using Statistics
using Random
Random.seed!(111)

N = 100
X = randn(N, 3)
y = X * [2, -2, 0.0] .+ randn(N) .+ 0.1
X = hcat(X, ones(N)) # add intercept term

# convex function to minimize (here, MSE)
# note that this function should not include L1 penalty, OWL-QN algorithm applies L1 penalty during gradient update step
# parameters beta must be a 1D vector
# in this implementation, all elements of beta are L1 regularized (e.g., including intercept parameter)
function f(beta::Vector{Float64})
diff = y - X * beta;
sum(diff.^2.0) / size(y)[1]
end

function ∇f(beta::Vector{Float64})
-2 .* X' * (y - X * beta) / size(y)[1]
end

# initialize parameters
beta = ones(size(X)[2])

# optimization with lambda = 0.2 regularization strength
lambda = 0.2
M = OWLQN(beta, lambda=lambda);

for i in 1:100
step!(M, f, ∇f, beta);

mse = f(beta);
nrm = sum(abs.(beta))
loss = mse + lambda * nrm

print(string("Iteration: ", i, " Loss: ", loss, " MSE: ", mse, "\n"))
end

# OWL-QN
print(string("OWL-QN L1-regularized solution: ", round.(beta, digits=3),"\n"))

# OLS
print(string("OLS solution: ", round.(inv(X' * X) * X' * y, digits=3),"\n"))```