Skip to content

Instantly share code, notes, and snippets.

View zachguo's full-sized avatar

Zach Guo zachguo

View GitHub Profile
@zachguo
zachguo / lr_feature_selection.py
Created February 24, 2015 23:03
Logistic regression (LR) feature selection, plus a helper that shows the support (indices) of the selected features
def feature_selection(X_train, y_train, X_test):
    """Reduce the train and test feature matrices with an L1-penalised logistic regression.

    The estimator is fitted on the training data only; that same fitted
    estimator is then used to transform the test matrix.

    NOTE(review): this depends on ``LogisticRegression`` exposing
    ``fit_transform``/``transform``; presumably that requires an older
    scikit-learn release -- confirm against the installed version.

    Returns:
        A tuple ``(X_train, X_test, feature_selector)``: the transformed
        training matrix, the transformed test matrix, and the fitted
        estimator.
    """
    selector = LogisticRegression(penalty="l1", C=0.1, dual=False)
    reduced_train = selector.fit_transform(X_train, y_train)
    reduced_test = selector.transform(X_test)
    return reduced_train, reduced_test, selector
def get_support(feature_selector):
    """Return the distinct last-axis indices at which ``coef_`` is non-zero.

    Args:
        feature_selector: Any object with a ``coef_`` array attribute.

    Returns:
        A list of unique indices. Duplicates are removed through a ``set``,
        so the order of the result is not explicitly sorted.
    """
    nonzero_mask = feature_selector.coef_ != 0
    # np.where on a bare condition yields one index array per axis;
    # only the last axis is of interest here.
    last_axis_hits = np.where(nonzero_mask)[-1]
    distinct_hits = set(last_axis_hits)
    return list(distinct_hits)
@zachguo
zachguo / d3test.html
Created December 17, 2014 00:38
Meteor JS: Reactive D3 Force Layout Graph (minimum example)
<head>
<title>d3test</title>
</head>
<body>
<h3>Meteor JS: Reactive D3 Force Layout Graph (minimum example)</h3>
<h4>Add few nodes and links first, or visualization won't show up.</h4>
<span>New Node</span>
<form id="newnode">
@zachguo
zachguo / get_topic_features
Created July 22, 2014 22:18
Derive topic features from a text pandas series
import pandas as pd
from gensim import corpora, models
def get_topic_features(col):
"""Derive topic features from a text pandas series"""
# generate topics for corpora
colname = col.name
col = col.astype(str).apply(lambda x:x.split())
dictionary = corpora.Dictionary(col)
corpus = [dictionary.doc2bow(text) for text in col]
@zachguo
zachguo / states
Created July 11, 2014 06:33
internet use data for states
# internet_rate is "Households with no internet use in and outside the home in 2010", gathered from http://www.ntia.doc.gov/files/ntia/data/CPS2010Tables/t11_2.txt
STATES = {"AL":{"full_name":"ALABAMA",
"geo_region":"South",
"internet_rate":25.82},
"AK":{"full_name":"ALASKA",
"geo_region":"West",
"internet_rate":11.36},
"AS":{"full_name":"AMERICAN SAMOA",
"geo_region":"Outer",
"internet_rate":None},
@zachguo
zachguo / print_cm.py
Last active May 31, 2022 17:39
Pretty print for sklearn confusion matrix
from sklearn.metrics import confusion_matrix
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
"""pretty print for confusion matrixes"""
columnwidth = max([len(x) for x in labels]+[5]) # 5 is value length
empty_cell = " " * columnwidth
# Print header
print " " + empty_cell,
for label in labels:
print "%{0}s".format(columnwidth) % label,
@zachguo
zachguo / OLS.py
Created March 2, 2014 06:34
Ordinary least squares (OLS) linear regression & print full results
import pandas
import statsmodels.api as sm
import numpy as np
def print_full(x):
    """Print ``x`` with the pandas row limit raised to ``len(x)``.

    ``display.max_rows`` is changed only while printing. ``option_context``
    restores the value that was active before the call (not the library
    default), and it does so even if printing raises, so a caller's own
    display settings are never clobbered.

    Args:
        x: Object to print; it must support ``len()``.
    """
    with pandas.option_context('display.max_rows', len(x)):
        print(x)
# Load the CSV named below (path is relative to the working directory) into a DataFrame.
dataframe = pandas.read_csv("turnstile_data_master_with_weather.csv")
a
a's
a’s
able
about
above
according
accordingly
across
actually
# "terminal-notifier" should be installed first
notify <- function(msg="Operation complete") {
in.osx <- (Sys.info()['sysname'] == "Darwin")
in.rstudio <- (Sys.getenv("RSTUDIO") == "1")
in.rgui <- (Sys.getenv("R_GUI_APP_REVISION") != "")
if (in.rstudio) { # hack to see if running in RStudio
title <- "RStudio"