// Kaggle Avazu Challenge: FTRL-Proximal with L1 & L2 regularization, implemented in Go (single-threaded).
// Based on tinrtgu's Python script here:
// https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory
package main
import (
"encoding/csv"
"os"
"strconv"
"hash/fnv"
"math"
"log"
"time"
)
// ###############################
// parameters
//################################
// Run configuration. main assigns each of these before anything reads them.
var (
	train      string // path of the training CSV
	test       string // path of the CSV to score
	submission string // path of the predictions CSV that main writes
	epoch      int    // number of passes over the training file
	D          uint32 // modulus applied to every feature hash in nextRow
)
// FTRL bundles the hyper-parameters and the per-feature state of an
// FTRL-Proximal learner. All three maps are keyed by feature index.
type FTRL struct {
	// Hyper-parameters, read by predict and update.
	alpha float64
	beta  float64
	L1    float64
	L2    float64

	n map[uint32]float64 // running sum of squared gradients per feature
	z map[uint32]float64 // accumulated adjusted gradients; predict derives weights from these
	w map[uint32]float64 // weights computed by the most recent predict call
}
// predict returns the estimated probability that y == 1 for the example
// whose active feature indices are listed in x.
//
// Side effects: the weight of every index in x is recomputed from z and n
// and stored in m.w (update reads those values afterwards), and an index
// that is missing from m.z gets a zero entry in n, z and w.
func (m *FTRL) predict(x []uint32) float64 {
	var wTx float64
	for _, idx := range x {
		z, seen := m.z[idx]
		if !seen {
			// First sighting of this feature: register it in every map.
			m.z[idx], m.n[idx], m.w[idx] = 0, 0, 0
		}

		// sign(z), with zero counted as positive.
		sign := 1.0
		if z < 0 {
			sign = -1.0
		}

		var weight float64
		switch {
		case sign*z <= m.L1:
			// L1 regularization keeps the weight at exactly zero.
			weight = 0.
		default:
			scale := (m.beta+math.Sqrt(m.n[idx]))/m.alpha + m.L2
			weight = (sign*m.L1 - z) / scale
		}
		m.w[idx] = weight
		wTx += weight
	}

	// Bound the raw score to [-35, 35] before applying the sigmoid.
	bounded := math.Max(math.Min(wTx, 35.0), -35.0)
	return 1.0 / (1.0 + math.Exp(-bounded))
}
// update folds one observed example into the model state.
//
// x lists the example's feature indices, p is the probability predict
// returned for it and y is the observed label. For every index the
// accumulated value in z and the squared-gradient sum in n are advanced;
// the weights in m.w are only read here, so predict must have been called
// on x beforehand for them to be current.
func (m *FTRL) update(x []uint32, p, y float64) {
	grad := p - y // gradient under logloss
	gradSq := grad * grad

	for _, idx := range x {
		prev := m.n[idx]
		sigma := (math.Sqrt(prev+gradSq) - math.Sqrt(prev)) / m.alpha
		m.z[idx] += grad - sigma*m.w[idx]
		m.n[idx] = prev + gradSq
	}
}
// hash returns the 32-bit FNV-1a digest of s.
func hash(s string) uint32 {
	hasher := fnv.New32a()
	// The hash.Hash contract states that Write never returns an error.
	_, _ = hasher.Write([]byte(s))
	return hasher.Sum32()
}
// nextRow reads one record from reader and converts it into a training or
// test example. column_names maps header names to column positions.
//
// It returns, in order:
//   - the value of the "id" column;
//   - the label: 1.0 when a "click" column exists and holds "1", else 0.0;
//   - characters 4-5 of the "hour" column parsed as an integer, minus 20
//     (0 - 20 when they do not parse);
//   - one hashed index per remaining column, reduced modulo D.
//
// When the reader reports any error, end of input included, the feature
// slice is nil and the other results are zero values.
func nextRow(reader *csv.Reader, column_names map[string]int) (string, float64, int, []uint32) {
	record, err := reader.Read()
	if err != nil {
		return "", 0, 0, nil
	}

	idCol := column_names["id"]
	clickCol, hasClick := column_names["click"]
	hourCol := column_names["hour"]

	id := record[idCol]

	// Columns that never become features: id, plus click when present.
	label := 0.0
	skipped := 1
	if hasClick {
		if record[clickCol] == "1" {
			label = 1.0
		}
		skipped = 2
	}

	// Split the timestamp: two characters feed the returned date, and only
	// the trailing part stays behind as the feature value.
	stamp := record[hourCol]
	day, _ := strconv.Atoi(stamp[4:6])
	record[hourCol] = stamp[6:]

	features := make([]uint32, len(record)-skipped)
	next := 0
	for col, value := range record {
		if col == idCol || (hasClick && col == clickCol) {
			continue
		}
		// The key is prefixed with the feature's position so equal values
		// in different columns hash differently.
		features[next] = hash(strconv.Itoa(next)+"_"+value) % D
		next++
	}
	return id, label, day - 20, features
}
// logloss returns the logarithmic loss of predicting probability p for the
// label y. p is first confined to [eps, 1-eps] so the logarithm stays
// finite. Any y other than exactly 1 is scored as the negative class.
func logloss(p, y float64) float64 {
	const eps = 10e-15

	bounded := math.Max(math.Min(p, 1.0-eps), eps)
	if y == 1. {
		return -math.Log(bounded)
	}
	return -math.Log(1. - bounded)
}
// opencsv returns a handle on filename. With create set, the file is
// created (or truncated) for writing via os.Create; otherwise it is opened
// for reading via os.Open. Any failure terminates the program through
// log.Fatal, so the returned handle is always usable.
func opencsv(filename string, create bool) *os.File {
	open := os.Open
	if create {
		open = os.Create
	}

	f, err := open(filename)
	if err != nil {
		log.Fatal(err)
	}
	return f
}
// columnIndex reads the header record from reader and maps every column
// name to its position. filename is only used in the error message. The
// program is aborted when the header cannot be read, because nextRow cannot
// locate the id/click/hour columns without it.
func columnIndex(reader *csv.Reader, filename string) map[string]int {
	header, err := reader.Read()
	if err != nil {
		log.Fatalf("reading header of %s: %v", filename, err)
	}
	columns := make(map[string]int, len(header))
	for i, name := range header {
		columns[name] = i
	}
	return columns
}

// main trains the FTRL model for `epoch` passes over the training file,
// logging a progressive logloss sampled on every holdout-th row, and then
// writes one "id,click" prediction per row of the test file to the
// submission file.
func main() {
	// Set up parameters.
	D = 1 << 20
	train = "head20"
	test = "t20"
	submission = "submission_go.csv"
	holdout := 30
	epoch = 2

	start := time.Now()
	model := FTRL{
		alpha: 0.15, beta: 1.1, L1: 1.1, L2: 1.1,
		n: make(map[uint32]float64),
		z: make(map[uint32]float64),
		w: make(map[uint32]float64),
	}

	for r := 0; r < epoch; r++ {
		trainfile := opencsv(train, false)
		reader := csv.NewReader(trainfile)
		columns := columnIndex(reader, train)

		count := 1
		sampled := 0.0 // number of rows contributing to loss
		loss := 0.0
		for {
			_, y, _, features := nextRow(reader, columns)
			if features == nil {
				// nextRow reports every read error this way, so this is
				// either EOF or a record the CSV reader rejected.
				break
			}
			p := model.predict(features)
			if count%holdout == 0 {
				// The prediction was made before this row is learned, so
				// it is a fair out-of-sample estimate.
				sampled++
				loss += logloss(p, y)
				if count%(holdout*100000) == 0 {
					log.Println(p, y, loss/sampled)
				}
			}
			count++
			model.update(features, p, y)
		}
		trainfile.Close()

		elapsed := time.Since(start)
		if sampled > 0 {
			log.Printf("Epoch %d took %s logloss %f", r+1, elapsed, loss/sampled)
		} else {
			// Fewer than `holdout` rows were read; 0/0 would print NaN.
			log.Printf("Epoch %d took %s (no rows sampled for logloss)", r+1, elapsed)
		}
		start = time.Now()
	}

	// Start testing.
	testfile := opencsv(test, false)
	outfile := opencsv(submission, true)
	reader := csv.NewReader(testfile)
	writer := csv.NewWriter(outfile)
	columns := columnIndex(reader, test)

	// Add the header to the submission file.
	if err := writer.Write([]string{"id", "click"}); err != nil {
		log.Fatalf("writing %s: %v", submission, err)
	}
	for {
		ID, _, _, features := nextRow(reader, columns)
		if features == nil {
			break // reached EOF (or an unreadable record)
		}
		p := model.predict(features)
		if err := writer.Write([]string{ID, strconv.FormatFloat(p, 'f', -1, 64)}); err != nil {
			log.Fatalf("writing %s: %v", submission, err)
		}
	}

	// csv.Writer buffers its output; errors from the underlying file only
	// surface through Error after Flush.
	writer.Flush()
	if err := writer.Error(); err != nil {
		log.Fatalf("writing %s: %v", submission, err)
	}
	testfile.Close()
	if err := outfile.Close(); err != nil {
		log.Fatalf("closing %s: %v", submission, err)
	}
	log.Printf("Testing took %s", time.Since(start))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment