robie lee liwh

## avazu_ftrl_concurrent.go
// Based on tinrtgu's Python script here:
// https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory
package main

import (
    "encoding/csv"
    "os"
    "strconv"
    "hash/fnv"
    "math"

## press.one
https://press.one/p/address/v?s=2aa2051c648db6d09b4ce24be0bb00d278e15c6697b5602919f40a060083bb41d01415ab13403565bbc89a06edf25d6c5dd7b5d9ddeca73717ab6e04a3414e501&h=503c45393fbf9ecbefae56f94b5ee6157e1f1fc57cdf4fd6f72be55fe9a834a5&a=702e876f6bb7a019e13e94b2a86f167c9770a0b3&f=P1&v=2

## elasticsearch_best_practices.txt
If you want, I can try to help with pointers on how to improve the indexing speed you get. It's quite easy to increase it significantly by following some simple guidelines, for example:

- Use create in the index API (assuming you can).
- Relax the real time aspect from 1 second to something a bit higher (index.engine.robin.refresh_interval).
- Increase the indexing buffer size (indices.memory.index_buffer_size), it defaults to the value 10% which is 10% of the heap.
- Increase the number of dirty operations that trigger automatic flush (so the translog won't get really big, even though it's FS based) by setting index.translog.flush_threshold (defaults to 5000).
- Increase the memory allocated to elasticsearch node. By default it's 1g.
- Start with a lower replica count (even 0), and then once the bulk loading is done, increase it to the value you want it to be using the update_settings API. This will improve things as possibly fewer shards will be allocated to each machine.
- Increase the number of machines you have so

## airflow.cfg
[core]
# The home folder for airflow, default is ~/airflow
airflow_home = /Users/p1nox/airflow

# The folder where your airflow pipelines live, most likely a
# subfolder in a code repository
dags_folder = /Users/p1nox/airflow/dags

# The folder where airflow should store its log files. This location
base_log_folder = /Users/p1nox/airflow/logs

## gist:c0c05e929aecc3e70abb
[uwsgi]
socket = /data/app/run/%n.sock
pidfile2 = /data/app/run/%n.pid
logto2 = /data/app/logs/uwsgi.log

logdate = true
log-format = [%(addr)] [%(ctime)] [%(method)] [%(uri)] [%(proto)] [%(status)] [%(msecs)] [%(referer)] [%(uagent)]

memory-report = true

## gist:9e0f7e765403a631ec28
88770

## access.lua
-- Certain endpoints are always blocked: reject them before doing any other work.
-- NOTE(review): nginx_uri is not defined anywhere in this snippet; presumably it
-- is set earlier from the request URI -- confirm against the full file.
if nginx_uri == "/_access_token" or nginx_uri == "/_me" then
    ngx.exit(403)
end

-- import requirements
local cjson = require "cjson"

-- setup some app-level vars
local app_id = "APP_ID"

## gist:d4cb15fe2d28808d15e6
290a4210ec26f55ddbf5dae952ad0c3c

## gist:9541682
package

## gist:9360280
==> Installing dependencies for python: readline, sqlite, gdbm
==> Installing python dependency: readline
==> Downloading https://downloads.sf.net/project/machomebrew/Bottles/readline-6.2.4.mavericks.bottle.2.tar.gz
######################################################################## 100.0%
==> Pouring readline-6.2.4.mavericks.bottle.2.tar.gz
==> Caveats
This formula is keg-only, so it was not symlinked into /usr/local.

OS X provides the BSD libedit library, which shadows libreadline.
In order to prevent conflicts when programs look for libreadline we are
	// Based on tinrtgu's Python script here:
	// https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory
	package main

	import (
	"encoding/csv"
	"os"
	"strconv"
	"hash/fnv"
	"math"
	If you want, I can try to help with pointers on how to improve the indexing speed you get. It's quite easy to increase it significantly by following some simple guidelines, for example:

	- Use create in the index API (assuming you can).
	- Relax the real time aspect from 1 second to something a bit higher (index.engine.robin.refresh_interval).
	- Increase the indexing buffer size (indices.memory.index_buffer_size), it defaults to the value 10% which is 10% of the heap.
	- Increase the number of dirty operations that trigger automatic flush (so the translog won't get really big, even though it's FS based) by setting index.translog.flush_threshold (defaults to 5000).
	- Increase the memory allocated to elasticsearch node. By default it's 1g.
	- Start with a lower replica count (even 0), and then once the bulk loading is done, increase it to the value you want it to be using the update_settings API. This will improve things as possibly fewer shards will be allocated to each machine.
	- Increase the number of machines you have so
	[core]
	# The home folder for airflow, default is ~/airflow
	airflow_home = /Users/p1nox/airflow

	# The folder where your airflow pipelines live, most likely a
	# subfolder in a code repository
	dags_folder = /Users/p1nox/airflow/dags

	# The folder where airflow should store its log files. This location
	base_log_folder = /Users/p1nox/airflow/logs
	[uwsgi]
	socket = /data/app/run/%n.sock
	pidfile2 = /data/app/run/%n.pid
	logto2 = /data/app/logs/uwsgi.log

	logdate = true
	log-format = [%(addr)] [%(ctime)] [%(method)] [%(uri)] [%(proto)] [%(status)] [%(msecs)] [%(referer)] [%(uagent)]

	memory-report = true
	-- Certain endpoints are always blocked: reject them before doing any other work.
	-- NOTE(review): nginx_uri is not defined anywhere in this snippet; presumably it
	-- is set earlier from the request URI -- confirm against the full file.
	if nginx_uri == "/_access_token" or nginx_uri == "/_me" then
		ngx.exit(403)
	end

	-- import requirements
	local cjson = require "cjson"

	-- setup some app-level vars
	local app_id = "APP_ID"
	==> Installing dependencies for python: readline, sqlite, gdbm
	==> Installing python dependency: readline
	==> Downloading https://downloads.sf.net/project/machomebrew/Bottles/readline-6.2.4.mavericks.bottle.2.tar.gz
	######################################################################## 100.0%
	==> Pouring readline-6.2.4.mavericks.bottle.2.tar.gz
	==> Caveats
	This formula is keg-only, so it was not symlinked into /usr/local.

	OS X provides the BSD libedit library, which shadows libreadline.
	In order to prevent conflicts when programs look for libreadline we are