L1Convex.jl — Julia implementation of the OWL-QN algorithm for large-scale L1-regularized sparse convex optimization
module L1Convex
# This is a Julia module implementing the OWL-QN algorithm for large-scale L1-regularized convex optimization
# https://www.microsoft.com/en-us/research/wp-content/uploads/2007/01/andrew07scalable.pdf
# http://proceedings.mlr.press/v37/gonga15.pdf
# The algorithm minimizes a loss of the form: f(x) + lambda * ||x||_1
# where f(x) is a differentiable convex function,
# ||x||_1 is the L1 norm of the parameter vector x,
# and lambda is the regularization strength.
# The L-BFGS component of the implementation is based on:
# https://en.wikipedia.org/wiki/Limited-memory_BFGS
# https://mitpress.mit.edu/books/algorithms-optimization
# https://bitbucket.org/rtaylor/pylbfgs/src/master/
# Author: Yegor Tkachenko (2022)

export OWLQN, step!

# library supporting the ⋅ (dot product) notation
using LinearAlgebra

# struct with key algorithm state and parameters
mutable struct OWLQN
    s               # parameter differences x_{t+1} - x_t [history of at most m vectors]
    y               # gradient differences ∇f(x_{t+1}) - ∇f(x_t) [history of at most m vectors]
    rho             # 1 / (s[i] ⋅ y[i]) for each stored pair
    m::Int          # L-BFGS history length
    t::Int          # iteration counter
    lambda::Float64 # L1 penalty strength
end
# constructor
function OWLQN(x::Vector{Float64}; lambda = 1.0)
    s = []
    y = []
    rho = []
    m = 6
    t = 0
    OWLQN(s, y, rho, m, t, lambda)
end

# pseudo-gradient of the L1-regularized objective, computed from the raw gradient,
# the parameter values, and the L1 regularization strength
function pseudo_gradient(g::Vector{Float64}, x::Vector{Float64}, lambda::Float64)
    pg = zeros(size(g))
    for i in 1:size(g)[1]
        if x[i] > 0
            pg[i] = g[i] + lambda
        elseif x[i] < 0
            pg[i] = g[i] - lambda
        else
            if g[i] + lambda < 0
                pg[i] = g[i] + lambda
            elseif g[i] - lambda > 0
                pg[i] = g[i] - lambda
            end
        end
    end
    return pg
end
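
# Worked example (added for illustration, not part of the algorithm): with lambda = 0.5,
#   x = [1.0, -2.0, 0.0, 0.0] and g = [0.3, 0.1, 2.0, 0.2],
#   pseudo_gradient(g, x, 0.5) returns [0.8, -0.4, 1.5, 0.0]:
#   the first two entries are shifted by ±lambda according to sign(x[i]),
#   the third moves off zero because |g[3]| > lambda,
#   and the last stays at zero because |g[4]| <= lambda.
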
# pi alignment operator - projection of a onto the orthant defined by b
function project!(a::Vector{Float64}, b::Vector{Float64})
    for i in 1:size(a)[1]
        if sign(a[i]) != sign(b[i])
            a[i] = 0.0
        end
    end
end
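
# Example (added for illustration): after project!(a, b) with a = [1.0, -2.0, 3.0]
# and b = [1.0, 1.0, -1.0], a becomes [1.0, 0.0, 0.0], since the last two
# entries of a disagree in sign with b.
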
# projected backtracking line search: shrinks the step until the L1-regularized
# objective satisfies an Armijo-style sufficient decrease condition
function projected_backtracking_line_search_update(f::Function, pg::Vector{Float64}, x::Vector{Float64}, z::Vector{Float64}, lambda::Float64; alpha=1.0, beta=0.5, gamma=1e-4)
    y = f(x) + lambda*sum(abs.(x))

    # choose the orthant for the new point: sign of x, or sign of the
    # descent direction -pg where x is exactly zero
    xi = sign.(x)
    for i in 1:size(xi)[1]
        if xi[i] == 0
            xi[i] = sign(-pg[i])
        end
    end

    while true
        # candidate point
        xt = x - alpha * z
        # project the candidate onto the chosen orthant
        project!(xt, xi)
        # sufficient decrease condition
        if f(xt) + lambda*sum(abs.(xt)) <= y + gamma * (pg ⋅ (xt - x))
            x .= xt
            break
        end
        # shrink step size
        alpha *= beta
    end
end

# single in-place update of the parameter vector x
function step!(M::OWLQN, f::Function, ∇f::Function, x::Vector{Float64})
    s, y, rho, lambda = M.s, M.y, M.rho, M.lambda
    g = ∇f(x)
    pg = pseudo_gradient(g, x, lambda)

    # optimality is measured with the pseudo-gradient: for the L1-regularized
    # objective a zero raw gradient is neither necessary nor sufficient
    if all(pg .== 0.0)
        println("(Local) minimum found: pseudo-gradient == 0")
        return
    end

    m = min(M.m, length(s))
    M.t += 1

    x_copy = deepcopy(x)
    g_copy = deepcopy(g)

    Q = deepcopy(pg)

    if m > 0
        # L-BFGS two-loop recursion: z approximates H_inv * pg,
        # i.e. the pseudo-gradient scaled by the inverse Hessian approximation
        alpha = []
        for i in m : -1 : 1
            push!(alpha, rho[i] * (s[i] ⋅ Q))
            Q -= alpha[end] * y[i]
        end
        reverse!(alpha)

        z = Q .* (s[end] ⋅ y[end]) / (y[end] ⋅ y[end])
        for i in 1 : m
            z += s[i] * (alpha[i] - rho[i] * (y[i] ⋅ z))
        end

        # zero out every element of z whose sign disagrees with pg,
        # i.e. where the Hessian scaling flipped the sign of the pseudo-gradient
        project!(z, pg)

        # update x in the direction -z via projected backtracking line search
        projected_backtracking_line_search_update(f, pg, x, z, lambda)
    else
        # no curvature history yet: take a normalized pseudo-gradient step,
        # again via projected backtracking line search
        projected_backtracking_line_search_update(f, pg, x, pg, lambda, alpha = 1 / sqrt(pg ⋅ pg))
    end

    # update the L-BFGS history
    push!(s, x - x_copy)
    push!(y, ∇f(x) - g_copy)
    push!(rho, 1 / (y[end] ⋅ s[end]))

    # keep at most M.m curvature pairs
    while length(s) > M.m
        popfirst!(s)
        popfirst!(y)
        popfirst!(rho)
    end
end
end
yegortk commented May 30, 2022:
To use the L1Convex module, load it directly from this gist:

using HTTP: request

"https://gist.githubusercontent.com/yegortk/ce18975200e7dffd1759125972cd54f4/raw/4d4d4543c5b3266a891a67c780f3f2687c89a579/L1Convex.jl" |>
  url -> request("GET", url) |> 
  res -> String(res.body) |> 
  str -> include_string(Main, str)

using .L1Convex
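
Alternatively (a simple local variant, not shown in the original comment), save the code above to a file named L1Convex.jl and load it with:

include("L1Convex.jl")
using .L1Convex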

yegortk commented May 30, 2022:
### EXAMPLE ###

using .L1Convex

using Statistics
using Random
Random.seed!(111)

N = 100
X = randn(N, 3)
y = X * [2, -2, 0.0] .+ randn(N) .+ 0.1
X = hcat(X, ones(N)) # add intercept term

# convex function to minimize (here, MSE)
# note that this function should not include the L1 penalty; the OWL-QN algorithm applies the L1 penalty itself during the update step
# the parameter vector beta must be a 1D vector
# in this implementation, all elements of beta are L1-regularized (including the intercept)
function f(beta::Vector{Float64})
    diff = y - X * beta;
    sum(diff.^2.0) / size(y)[1]
end

# gradient function
function ∇f(beta::Vector{Float64})
    -2 .* X' * (y - X * beta) / size(y)[1]
end
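
# optional sanity check (an addition, not in the original example): compare the
# analytic gradient ∇f against central finite differences at a random point
beta_test = randn(size(X)[2])
h = 1e-6
fd = zeros(size(X)[2])
for j in 1:size(X)[2]
    e = zeros(size(X)[2]); e[j] = h
    fd[j] = (f(beta_test + e) - f(beta_test - e)) / (2 * h)
end
print(string("max |finite difference - analytic gradient|: ", maximum(abs.(fd - ∇f(beta_test))), "\n"))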

# initialize parameters
beta = ones(size(X)[2])

# optimization with lambda = 0.2 regularization strength
lambda = 0.2
M = OWLQN(beta, lambda=lambda);

for i in 1:100    
    step!(M, f, ∇f, beta);
    
    mse = f(beta);
    nrm = sum(abs.(beta))
    loss = mse + lambda * nrm
    
    print(string("Iteration: ", i, " Loss: ", loss, " MSE: ", mse, "\n"))
end
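
# a possible variant (an addition, not in the original example): stop early once the
# penalized loss stops improving, instead of always running a fixed iteration count
beta_es = ones(size(X)[2])
M_es = OWLQN(beta_es, lambda=lambda)
prev_loss = Inf
for i in 1:100
    step!(M_es, f, ∇f, beta_es)
    cur_loss = f(beta_es) + lambda * sum(abs.(beta_es))
    prev_loss - cur_loss < 1e-10 && break
    global prev_loss = cur_loss
end
print(string("Early-stopped solution: ", round.(beta_es, digits=3), "\n"))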

# OWL-QN
print(string("OWL-QN L1-regularized solution: ", round.(beta, digits=3),"\n"))

# OLS
print(string("OLS solution: ", round.(inv(X' * X) * X' * y, digits=3),"\n"))
