Tong tonglu

## RandomizedLogisticRegression.py
from sklearn.datasets import make_classification
from sklearn.linear_model import RandomizedLogisticRegression
import pandas as pd

clf = RandomizedLogisticRegression(sample_fraction = .75, n_resampling = 3, normalize = False)

X, y = make_classification(n_samples=10,
                           n_features=3,
                           n_informative=2,
                           n_redundant=0,

## clear_image_cache_on_github
# Ask GitHub's image proxy (Camo) to drop its cached copy of an image so the
# next page view fetches it again. Replace the placeholder with the image's
# camo.githubusercontent.com URL (the <img> src on the rendered page).
curl -X PURGE "https://camo.githubusercontent.com/<image-hash>"

## desubmodulize.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                tonglu
                / desubmodulize.md
            
            
              Last active
              September 6, 2016 22:06
            
          
One has to create a fresh fork of dmlc/xgboost, like this one.


Because the repository uses git submodules, devtools can no longer install the R package. The submodules have to be removed in the following manner:


http://stackoverflow.com/questions/1030169/easy-way-pull-latest-of-all-submodules
https://gist.github.com/kyleturner/1563153


Bring back the necessary customizations that we have made here.


Test the new repo with avant-analytics and `run("demo_xgb_R")`.


## gist:497e871a92dc9585432fcbcf82f56a50
 # Write the MD5 digest of each regular file in the current directory to
 # <file>.md5. Globbing replaces parsing `ls` output so names containing
 # whitespace survive, and the stray `;` after `do` (a syntax error in
 # bash/sh) is removed.
 for f in ./*; do
   [ -f "$f" ] || continue               # skip directories / unmatched glob
   case "$f" in *.md5) continue ;; esac  # do not checksum earlier digests
   md5 -q "$f" > "$f.md5"                # -q prints only the digest
 done

# Read every *.md5 digest file in the working directory into a list.
# The pattern is anchored on the literal ".md5" extension (the original
# "md5$" also matched names such as "notes_md5"), and the list is named by
# file so each checksum can be traced back to the file it came from.
md5_files <- list.files(pattern = "\\.md5$")
lapply(setNames(md5_files, md5_files), readLines)

## edit_function_in_R_console.R
# Open the current body of mod$predict_fn in R's interactive editor and
# assign whatever edit() returns back as the function's new body, so the
# function can be patched in place from the console.
body(mod$predict_fn) <- edit(body(mod$predict_fn))

## fast_split.R
v <- sample(1:1000, 50000000, replace = T)
df <- data.frame(id = v)
# split(df, df$id) is SLOW because it's O(n^2)
# sort(df$id) is FAST because it's O(n ln n)
sid <- sort(df$id)
rle(sid) -> rlesid # Fast because O(n)
tally <- 1
values <- numeric(length(rlesid$lengths)); indices <- numeric(length(rlesid$lengths))
for (i in seq_along(indices)) {
  indices[i] <- tally

## install java
# Install the headless OpenJDK 7 runtime without prompting (-y).
sudo apt-get install -y openjdk-7-jre-headless
# Re-run R's Java configuration step so R picks up the Java installation.
sudo R CMD javareconf

## test_new_pkg_with_lockbox
# Lockfile entry that loads yourpkg from a local directory (remote: local)
# rather than from a hosted repository. Written as a list item with aligned
# keys so that it parses as valid YAML.
- name: yourpkg
  version: 0.1.0
  remote: local
  dir: /your/pkg

Add this entry to your lockfile and comment out the entry that installs the package from GitHub.

## regex.r
a|b|c - means the string contains "a" or "b" or "c"
^(a|b|c)$ - means IS the string 'a' or 'b' or 'c' same as:  %in% c('a','b','c')
^(a|b|c) - means begins with 'a' or 'b' or 'c'
(a|b|c)$ - means ends with 'a' or 'b' or 'c'

^[a-zA-Z0-9]*$ - means the string consists only of the characters a-z, A-Z or 0-9 (the empty string also matches)

## vim_tips
:%s/tu_final_prediction_data/env$data/g  # replace every occurrence of the pattern in the whole file
	from sklearn.datasets import make_classification
	from sklearn.linear_model import RandomizedLogisticRegression
	import pandas as pd

	clf = RandomizedLogisticRegression(sample_fraction = .75, n_resampling = 3, normalize = False)

	X, y = make_classification(n_samples=10,
	n_features=3,
	n_informative=2,
	n_redundant=0,
	for i in `ls .`; do;
	md5 $i \| cut -d' ' -f4 > $i.md5
	done;

	lapply(list.files(pattern = "md5$"), readLines)
	v <- sample(1:1000, 50000000, replace = T)
	df <- data.frame(id = v)
	# split(df, df$id)) is SLOW because it's O(n^2)
	# sort(df$id) is FAST because it's O(n ln n)
	sid <- sort(df$id)
	rle(sid) -> rlesid # Fast because O(n)
	tally <- 1
	values <- numeric(length(rlesid$lengths)); indices <- numeric(length(rlesid$lengths))
	for (i in seq_along(indices)) {
	indices[i] <- tally
	sudo apt-get install -y openjdk-7-jre-headless
	sudo R CMD javareconf
	name: yourpkg
	version: 0.1.0
	remote: local
	dir: /your/pkg

	add this to your lockfile and comment out the part that installs from github
	a\|b\|c - means contains the strings "a" or "b" or "c"
	^(a\|b\|c)$ - means IS the string 'a' or 'b' or 'c' same as: %in% c('a','b','c')
	^(a\|b\|c) - means begins with 'a' or 'b' or 'c'
	(a\|b\|c)$ - means ends with 'a' or 'b' or 'c'

	^[a-zA-Z0-9]*$ - means IS the string contains only a-z, A-Z or 0-9