Skip to content

Instantly share code, notes, and snippets.

View statcompute's full-sized avatar

WenSui Liu statcompute

  • San Antonio, TX
View GitHub Profile
@statcompute
statcompute / xframes_example.py
Last active November 24, 2018 06:08
An example for xframes
from xframes import XFrame, aggregate
df = XFrame.read_csv("Downloads/nycflights.csv", header = True, nrows = 11)
### SUBSETTING
sel_cols = ["origin", "dest", "distance", "dep_delay", "carrier"]
df2 = df[sel_cols]
# OR:
# df.sql("select " + ", ".join(sel_cols) + " from df")
@statcompute
statcompute / list_vs_queue.r
Last active November 24, 2018 06:31
Growing List vs Growing Queue
### GROWING LIST ###
# Convert a data frame into a list of row-wise lists by growing the result
# one element at a time (the "growing list" variant; no pre-allocation).
#
# df: a data frame.
# Returns a list with one element per row of `df`; each element is
# as.list() applied to that single-row slice. A zero-row data frame
# yields an empty list.
base_lst1 <- function(df) {
  l <- list()
  # seq_len() gives an empty sequence for zero rows, whereas seq(0) gives
  # c(1, 0) and would make the loop index rows that do not exist.
  for (i in seq_len(nrow(df))) l[[i]] <- as.list(df[i, ])
  l
}
### PRE-ALLOCATING LIST ###
base_lst2 <- function(df) {
l <- vector(mode = "list", length = nrow(df))
@statcompute
statcompute / df2dictlist.r
Created November 24, 2018 06:31
Converting data frame to dictionary list
### AS.LIST() FUNCTION IN BASE PACKAGE ###
x1 <- as.list(iris[1, ])
names(x1)
# [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
x1[["Sepal.Length"]]
# [1] 5.1
### ENVIRONMENT-BASED SOLUTION ###
envn_dict <- function(x) {
e <- new.env(hash = TRUE)
@statcompute
statcompute / sas2lua.sas
Created November 24, 2018 06:35
Fetch sas data set to lua table
/* Build data set ONE: for each label in the array ("A", "B"), write two
   observations, each carrying a rounded random value in y. */
data one;
/* Temporary character array holding the two labels; _temporary_ array
   elements are not written to the output data set. */
array c{2} $ _temporary_ ("A", "B");
/* Outer loop: one pass per array element. */
do i = 1 to dim(c);
x = c[i];
/* Inner loop: two draws per label. */
do j = 1 to 2;
/* Normal random variate from RANNOR with seed argument 1, rounded to
   the nearest 0.0001. */
y = round(rannor(1), 0.0001);
/* Explicit OUTPUT writes one observation per inner iteration
   (2 labels x 2 draws = 4 observations). */
output;
end;
end;
run;
@statcompute
statcompute / astropy_example.py
Created November 24, 2018 06:38
Data wrangling with astropy
from astropy.io import ascii
from astropy.table import Table, join
from numpy import nanmean, nanmedian, array, sort
tbl1 = ascii.read("Downloads/nycflights.csv", format = "csv")
### SUBSETTING
sel_cols = ["origin", "dest", "distance", "dep_delay", "carrier"]
@statcompute
statcompute / dataset_example.py
Created November 24, 2018 06:43
Manipulating dictionary list with SQLite back-end
from astropy.io.ascii import read
selected = ["origin", "dep_delay", "distance"]
csv = read("Downloads/nycflights.csv", format = 'csv', data_end = 11)[selected]
lst = map(lambda x: dict(zip(x.colnames, x)), csv)
from dataset import connect
@statcompute
statcompute / isoreg_bin.r
Created November 24, 2018 21:23
Monotonic Binning Based on Isotonic Regression
isoreg_bin <- function(data, y, x) {
n1 <- 50
n2 <- 10
yname <- deparse(substitute(y))
xname <- deparse(substitute(x))
df1 <- data[, c(yname, xname)]
df2 <- df1[!is.na(df1[, xname]), c(xname, yname)]
cor <- cor(df2[, 2], df2[, 1], method = "spearman", use = "complete.obs")
reg <- isoreg(df2[, 1], cor / abs(cor) * df2[, 2])
cut <- knots(as.stepfun(reg))
@statcompute
statcompute / bump_bin.r
Last active November 25, 2018 18:44
use bumping to improve monotonic binning
bump_bin <- function(data, y, x, n) {
n1 <- 50
n2 <- 10
set.seed(2019)
seeds <- c(0, round(runif(n) * as.numeric(paste('1e', ceiling(log10(n)) + 2, sep = '')), 0))
yname <- deparse(substitute(y))
xname <- deparse(substitute(x))
df1 <- data[, c(yname, xname)]
df2 <- df1[!is.na(df1[, xname]), c(xname, yname)]
cor <- cor(df2[, 2], df2[, 1], method = "spearman", use = "complete.obs")
@statcompute
statcompute / dropout_dnn.py
Created November 25, 2018 15:11
Dropout Regularization in Deep Neural Networks
from pandas import read_csv, DataFrame
from numpy.random import seed
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.models import Sequential
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers import Dense, Dropout
from multiprocessing import Pool, cpu_count
@statcompute
statcompute / autoencoder_example.py
Created November 25, 2018 15:14
Autoencoder for Dimensionality Reduction
from pandas import read_csv, DataFrame
from numpy.random import seed
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense
from keras.models import Model
# Load the credit data set from the working directory.
df = read_csv("credit_count.txt")
# Response: the DEFAULTS column, restricted to rows where CARDHLDR == 1.
Y = df[df.CARDHLDR == 1].DEFAULTS
# Predictors: columns at positions 2..11 of the same rows. `.iloc` replaces
# the `.ix` indexer, which was deprecated and then removed in pandas 1.0;
# the slice is purely positional, so the selected columns are unchanged.
X = df[df.CARDHLDR == 1].iloc[:, 2:12]