ceshine/avazu_ftrl.go

## avazu_ftrl.go
// Based on tinrtgu's Python script here:
// https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory
package main

import (
    "encoding/csv"
    "os"
    "strconv"
    "hash/fnv"
    "math"
    "log"
    "time"
)

// ###############################
// parameters
//################################

// Run configuration. Every value is assigned at the start of main.
var (
	train      string // path of the training CSV
	test       string // path of the test CSV
	submission string // path of the prediction file that main writes
	epoch      int    // number of passes over the training file
	D          uint32 // modulus applied to every feature hash in nextRow
)

// FTRL bundles the hyper-parameters and the per-feature state of the model.
// The three maps are keyed by hashed feature index.
type FTRL struct {
	// Hyper-parameters read by predict and update.
	alpha float64
	beta  float64
	L1    float64
	L2    float64

	n map[uint32]float64 // running sum of squared gradients per feature
	z map[uint32]float64 // accumulated state from which predict derives each weight
	w map[uint32]float64 // weight computed by the latest predict call, reused by update
}

// predict returns the estimated probability for the hashed feature indices x.
//
// For every index the weight is recomputed from z and n and stored in m.w, so
// a following update call can reuse it. Indices that have never been seen get
// zero entries in z, n and w. The summed weight is clamped to [-35, 35]
// before the sigmoid is applied.
func (m *FTRL) predict(x []uint32) float64 {
	var dot float64
	for _, idx := range x {
		zi, seen := m.z[idx]
		if !seen {
			// First sighting: create the state entries (zi is already 0).
			m.z[idx] = 0
			m.n[idx] = 0
		}

		sgn := 1.0
		if zi < 0 {
			sgn = -1.0
		}

		if sgn*zi <= m.L1 {
			// Inside the L1 band the weight is exactly zero and adds
			// nothing to the sum.
			m.w[idx] = 0.
			continue
		}

		weight := (sgn*m.L1 - zi) / ((m.beta+math.Sqrt(m.n[idx]))/m.alpha + m.L2)
		m.w[idx] = weight
		dot += weight
	}

	// Bounded sigmoid.
	clamped := math.Max(math.Min(dot, 35.0), -35.0)
	return 1.0 / (1.0 + math.Exp(-clamped))
}

// update adjusts the per-feature state for one sample.
//
// x holds the hashed feature indices of the sample, p is the probability that
// predict returned for it and y is the label. The weights read from m.w are
// the ones stored by that predict call.
func (m *FTRL) update(x []uint32, p, y float64) {
	// Gradient of the log-loss with respect to the summed weight.
	grad := p - y

	for _, idx := range x {
		seenSq := m.n[idx]
		sigma := (math.Sqrt(seenSq+grad*grad) - math.Sqrt(seenSq)) / m.alpha
		m.z[idx] += grad - sigma*m.w[idx]
		m.n[idx] += grad * grad
	}
}

// hash returns the 32-bit FNV-1a digest of s.
func hash(s string) uint32 {
	digest := fnv.New32a()
	// The hash.Hash contract states that Write never returns an error.
	_, _ = digest.Write([]byte(s))
	return digest.Sum32()
}

// nextRow reads one record from reader and turns it into a sample.
//
// column_names maps header names to column positions. The results are:
//   - the value of the "id" column;
//   - the label, 1 when a "click" column exists and holds "1", otherwise 0;
//   - date, the two digits at offsets 4-5 of the "hour" column minus 20
//     (presumably the column is YYMMDDHH; confirm against the data set);
//   - one hashed index per column other than "id" and "click", reduced
//     modulo D. Each value is prefixed with its feature position before
//     hashing. D must be non-zero.
//
// A nil feature slice means that no record could be read. End of input and
// any other read error are reported the same way, because the signature has
// no error result.
//
// When the "hour" column is present and at least six bytes long it is
// shortened to the part after the day before hashing. When it is missing or
// shorter, it is hashed unchanged and the day counts as 0, so date is -20,
// the same value a non-numeric day produces. Earlier this case panicked or
// sliced the wrong column.
func nextRow(reader *csv.Reader, column_names map[string]int) (string, float64, int, []uint32) {
	row, err := reader.Read()
	if err != nil {
		return "", 0, 0, nil
	}

	idCol := column_names["id"]
	id := row[idCol]

	// The label column only exists in training data.
	y := 0.0
	clickCol, hasClick := column_names["click"]
	if hasClick && row[clickCol] == "1" {
		y = 1.0
	}

	day := 0
	if hourCol, ok := column_names["hour"]; ok && len(row[hourCol]) >= 6 {
		stamp := row[hourCol]
		if parsed, convErr := strconv.Atoi(stamp[4:6]); convErr == nil {
			day = parsed
		}
		row[hourCol] = stamp[6:]
	}
	date := day - 20

	// Appending keeps the slice non-nil (callers use nil as the stop
	// signal) and cannot overrun when the id and click columns coincide.
	features := make([]uint32, 0, len(row))
	for i, value := range row {
		if i == idCol || (hasClick && i == clickCol) {
			continue
		}
		features = append(features, hash(strconv.Itoa(len(features))+"_"+value)%D)
	}

	return id, y, date, features
}

// logloss returns the logarithmic loss of the predicted probability p against
// the label y. p is first clamped to [10e-15, 1-10e-15]. A label of exactly 1
// scores -log(p); any other label scores -log(1-p).
func logloss(p, y float64) float64 {
	clipped := math.Max(math.Min(p, 1.0-10e-15), 10e-15)

	likelihood := 1. - clipped
	if y == 1. {
		likelihood = clipped
	}
	return -math.Log(likelihood)
}

// opencsv returns an open handle for filename. With create set the file is
// created through os.Create; otherwise it is opened through os.Open. A
// failure ends the program through log.Fatal.
func opencsv(filename string, create bool) *os.File {
	opener := os.Open
	if create {
		opener = os.Create
	}

	handle, err := opener(filename)
	if err != nil {
		log.Fatal(err)
	}
	return handle
}

// main trains the model on the training CSV for the configured number of
// epochs and then writes one prediction per test row to the submission file.
//
// During training every holdout-th row is scored with logloss before the
// model is updated with it; the running average is logged every
// holdout*100000 rows and at the end of each epoch. Every row, scored or
// not, is passed to update.
//
// A header that cannot be read and any failure while writing the submission
// end the program through log.Fatal rather than being ignored.
func main() {
	// Set up parameters
	D = 1 << 20
	train = "head20"
	test = "t20"
	submission = "submission_go.csv"
	holdout := 30
	epoch = 2

	start := time.Now()

	model := FTRL{alpha: 0.15, beta: 1.1, L1: 1.1, L2: 1.1,
		n: make(map[uint32]float64), z: make(map[uint32]float64), w: make(map[uint32]float64)}

	// readHeader consumes the first record of a file and maps each column
	// name to its position.
	readHeader := func(reader *csv.Reader, filename string) map[string]int {
		header, err := reader.Read()
		if err != nil {
			log.Fatalf("reading header of %s: %v", filename, err)
		}
		columns := make(map[string]int, len(header))
		for i, name := range header {
			columns[name] = i
		}
		return columns
	}

	for r := 0; r < epoch; r++ {
		trainfile := opencsv(train, false)
		reader := csv.NewReader(trainfile)
		column_names := readHeader(reader, train)

		count := 1
		l_count := 0.0
		loss := 0.0

		for {
			_, y, _, features := nextRow(reader, column_names)
			if features == nil {
				break // no more rows
			}
			p := model.predict(features)
			if count%holdout == 0 {
				l_count += 1
				loss += logloss(p, y)
				if count%(holdout*100000) == 0 {
					log.Println(p, y, loss/l_count)
				}
			}
			count++
			model.update(features, p, y)
		}
		trainfile.Close()

		elapsed := time.Since(start)
		// The reported logloss is NaN when fewer than holdout rows were read.
		log.Printf("Epoch %d took %s logloss %f", r+1, elapsed, loss/l_count)
		start = time.Now()
	}

	// Start testing
	testfile := opencsv(test, false)
	outfile := opencsv(submission, true)
	reader := csv.NewReader(testfile)
	writer := csv.NewWriter(outfile)
	column_names := readHeader(reader, test)

	// Add header to the submission file.
	if err := writer.Write([]string{"id", "click"}); err != nil {
		log.Fatalf("writing %s: %v", submission, err)
	}
	for {
		ID, _, _, features := nextRow(reader, column_names)
		if features == nil {
			break // no more rows
		}
		p := model.predict(features)
		if err := writer.Write([]string{ID, strconv.FormatFloat(p, 'f', -1, 64)}); err != nil {
			log.Fatalf("writing %s: %v", submission, err)
		}
	}

	// csv.Writer buffers its output, so a failed write may only show up here.
	writer.Flush()
	if err := writer.Error(); err != nil {
		log.Fatalf("writing %s: %v", submission, err)
	}
	testfile.Close()
	if err := outfile.Close(); err != nil {
		log.Fatalf("closing %s: %v", submission, err)
	}

	elapsed := time.Since(start)
	log.Printf("Testing took %s", elapsed)
}
	// Based on tinrtgu's Python script here:
	// https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory
	package main

	import (
	"encoding/csv"
	"os"
	"strconv"
	"hash/fnv"
	"math"
	"log"
	"time"
	)

	// ###############################
	// parameters
	//################################

	// Run configuration. Every value is assigned at the start of main.
	var (
		train      string // path of the training CSV
		test       string // path of the test CSV
		submission string // path of the prediction file that main writes
		epoch      int    // number of passes over the training file
		D          uint32 // modulus applied to every feature hash in nextRow
	)

	// FTRL bundles the hyper-parameters and the per-feature state of the model.
	// The three maps are keyed by hashed feature index.
	type FTRL struct {
		// Hyper-parameters read by predict and update.
		alpha float64
		beta  float64
		L1    float64
		L2    float64

		n map[uint32]float64 // running sum of squared gradients per feature
		z map[uint32]float64 // accumulated state from which predict derives each weight
		w map[uint32]float64 // weight computed by the latest predict call, reused by update
	}

	// predict returns the estimated probability for the hashed feature indices x.
	//
	// For every index the weight is recomputed from z and n and stored in m.w, so
	// a following update call can reuse it. Indices that have never been seen get
	// zero entries in z, n and w. The summed weight is clamped to [-35, 35]
	// before the sigmoid is applied.
	func (m *FTRL) predict(x []uint32) float64 {
		var dot float64
		for _, idx := range x {
			zi, seen := m.z[idx]
			if !seen {
				// First sighting: create the state entries (zi is already 0).
				m.z[idx] = 0
				m.n[idx] = 0
			}

			sgn := 1.0
			if zi < 0 {
				sgn = -1.0
			}

			if sgn*zi <= m.L1 {
				// Inside the L1 band the weight is exactly zero and adds
				// nothing to the sum.
				m.w[idx] = 0.
				continue
			}

			weight := (sgn*m.L1 - zi) / ((m.beta+math.Sqrt(m.n[idx]))/m.alpha + m.L2)
			m.w[idx] = weight
			dot += weight
		}

		// Bounded sigmoid.
		clamped := math.Max(math.Min(dot, 35.0), -35.0)
		return 1.0 / (1.0 + math.Exp(-clamped))
	}

	// update adjusts the per-feature state for one sample.
	//
	// x holds the hashed feature indices of the sample, p is the probability that
	// predict returned for it and y is the label. The weights read from m.w are
	// the ones stored by that predict call.
	func (m *FTRL) update(x []uint32, p, y float64) {
		// Gradient of the log-loss with respect to the summed weight.
		grad := p - y

		for _, idx := range x {
			seenSq := m.n[idx]
			sigma := (math.Sqrt(seenSq+grad*grad) - math.Sqrt(seenSq)) / m.alpha
			m.z[idx] += grad - sigma*m.w[idx]
			m.n[idx] += grad * grad
		}
	}

	// hash returns the 32-bit FNV-1a digest of s.
	func hash(s string) uint32 {
		digest := fnv.New32a()
		// The hash.Hash contract states that Write never returns an error.
		_, _ = digest.Write([]byte(s))
		return digest.Sum32()
	}

	func nextRow(reader *csv.Reader, column_names map[string]int) (string, float64, int, []uint32){
	row, err := reader.Read()
	if err != nil {
	return "", 0, 0, nil
	}

	features_n := len(row) - 1

	ID := row[column_names["id"]]

	//process clicks
	y := 0.0
	_, click := column_names["click"]
	if click == true {
	if row[column_names["click"]] == "1"{
	y = 1.0
	}
	features_n -= 1
	}
	date, _ := strconv.Atoi(row[column_names["hour"]][4:6])
	date -= 20

	row[column_names["hour"]] = row[column_names["hour"]][6:]

	features := make([]uint32, features_n)
	count := 0
	for i := 0; i < len(row); i++ {
	if i != column_names["id"]{
	if click == false \|\| i != column_names["click"]{
	features[count] = hash(strconv.Itoa(count) + "_" + row[i]) % D
	count += 1
	}
	}
	}

	return ID, y, date, features
	}

	// logloss returns the logarithmic loss of the predicted probability p against
	// the label y. p is first clamped to [10e-15, 1-10e-15]. A label of exactly 1
	// scores -log(p); any other label scores -log(1-p).
	func logloss(p, y float64) float64 {
		clipped := math.Max(math.Min(p, 1.0-10e-15), 10e-15)

		likelihood := 1. - clipped
		if y == 1. {
			likelihood = clipped
		}
		return -math.Log(likelihood)
	}

	// opencsv returns an open handle for filename. With create set the file is
	// created through os.Create; otherwise it is opened through os.Open. A
	// failure ends the program through log.Fatal.
	func opencsv(filename string, create bool) *os.File {
		opener := os.Open
		if create {
			opener = os.Create
		}

		handle, err := opener(filename)
		if err != nil {
			log.Fatal(err)
		}
		return handle
	}

	// main trains the model on the training CSV for the configured number of
	// epochs and then writes one prediction per test row to the submission file.
	//
	// During training every holdout-th row is scored with logloss before the
	// model is updated with it; the running average is logged every
	// holdout*100000 rows and at the end of each epoch. Every row, scored or
	// not, is passed to update.
	//
	// A header that cannot be read and any failure while writing the submission
	// end the program through log.Fatal rather than being ignored.
	func main() {
		// Set up parameters
		D = 1 << 20
		train = "head20"
		test = "t20"
		submission = "submission_go.csv"
		holdout := 30
		epoch = 2

		start := time.Now()

		model := FTRL{alpha: 0.15, beta: 1.1, L1: 1.1, L2: 1.1,
			n: make(map[uint32]float64), z: make(map[uint32]float64), w: make(map[uint32]float64)}

		// readHeader consumes the first record of a file and maps each column
		// name to its position.
		readHeader := func(reader *csv.Reader, filename string) map[string]int {
			header, err := reader.Read()
			if err != nil {
				log.Fatalf("reading header of %s: %v", filename, err)
			}
			columns := make(map[string]int, len(header))
			for i, name := range header {
				columns[name] = i
			}
			return columns
		}

		for r := 0; r < epoch; r++ {
			trainfile := opencsv(train, false)
			reader := csv.NewReader(trainfile)
			column_names := readHeader(reader, train)

			count := 1
			l_count := 0.0
			loss := 0.0

			for {
				_, y, _, features := nextRow(reader, column_names)
				if features == nil {
					break // no more rows
				}
				p := model.predict(features)
				if count%holdout == 0 {
					l_count += 1
					loss += logloss(p, y)
					if count%(holdout*100000) == 0 {
						log.Println(p, y, loss/l_count)
					}
				}
				count++
				model.update(features, p, y)
			}
			trainfile.Close()

			elapsed := time.Since(start)
			// The reported logloss is NaN when fewer than holdout rows were read.
			log.Printf("Epoch %d took %s logloss %f", r+1, elapsed, loss/l_count)
			start = time.Now()
		}

		// Start testing
		testfile := opencsv(test, false)
		outfile := opencsv(submission, true)
		reader := csv.NewReader(testfile)
		writer := csv.NewWriter(outfile)
		column_names := readHeader(reader, test)

		// Add header to the submission file.
		if err := writer.Write([]string{"id", "click"}); err != nil {
			log.Fatalf("writing %s: %v", submission, err)
		}
		for {
			ID, _, _, features := nextRow(reader, column_names)
			if features == nil {
				break // no more rows
			}
			p := model.predict(features)
			if err := writer.Write([]string{ID, strconv.FormatFloat(p, 'f', -1, 64)}); err != nil {
				log.Fatalf("writing %s: %v", submission, err)
			}
		}

		// csv.Writer buffers its output, so a failed write may only show up here.
		writer.Flush()
		if err := writer.Error(); err != nil {
			log.Fatalf("writing %s: %v", submission, err)
		}
		testfile.Close()
		if err := outfile.Close(); err != nil {
			log.Fatalf("closing %s: %v", submission, err)
		}

		elapsed := time.Since(start)
		log.Printf("Testing took %s", elapsed)
	}