Skip to content

Instantly share code, notes, and snippets.

View michael-erasmus's full-sized avatar

Michael Erasmus michael-erasmus

View GitHub Profile
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@michael-erasmus
michael-erasmus / clean_data.py
Created November 3, 2015 00:07
Simple logistic model
selected_words = [
'receipt',
'card',
'refund',
'month',
'monthly',
'plan',
'profit',
'charged',
'charge',
@michael-erasmus
michael-erasmus / tf-idf-keywords.py
Created November 2, 2015 23:56
TF-IDF keywords using Graphlab
import re
import graphlab

# Clean the raw document bodies into a bag-of-words-friendly 'words' column.
# Step 1: strip HTML tags.
docs['words'] = docs['body'].apply(lambda doc: re.sub(r"<[^>]*>", "", doc))
# Step 2: lowercase, trim surrounding whitespace, and replace every
# non-word character and digit with a space (raw strings so the regex
# escapes are not interpreted by Python first).
docs['words'] = docs['words'].apply(lambda doc: re.sub(r"[\W\d]", " ", doc.lower().strip()))

docs = graphlab.SFrame(docs)
docs['word_counts'] = graphlab.text_analytics.count_words(docs['words'])
docs_tfidf = graphlab.text_analytics.tf_idf(docs['words'])
# Top-10 keywords per document, highest TF-IDF first.
# BUG FIX: the original slice [1:10] skipped index 0 — the single
# highest-scoring keyword — and returned only 9 terms; [:10] returns
# the actual top 10.
docs['top10'] = docs_tfidf['docs'].apply(lambda t: " ".join(sorted(t, key=t.get, reverse=True)[:10]))
@michael-erasmus
michael-erasmus / experiment_result_p_value.sql
Last active September 28, 2015 15:08
Statistical significance UDF in Redshift
create or replace function experiment_result_p_value(control_size float, control_conversion float, experiment_size float, experiment_conversion float)
returns float
stable
as $$
from scipy.stats import chi2_contingency
from numpy import array
observed = array([
[control_size - control_conversion, control_conversion],
[experiment_size - experiment_conversion, experiment_conversion]
@michael-erasmus
michael-erasmus / tf-idf.py
Created September 24, 2015 20:38
Tf-idf example
import os
import math
import re
import pandas as pd
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
#get a subset of the dataset
categories = [
/*
This example uses Scala. Please see the MLlib documentation for a Java example.
Try running this code in the Spark shell. It may produce different topics each time (since LDA includes some randomization), but it should give topics similar to those listed above.
This example is paired with a blog post on LDA in Spark: http://databricks.com/blog
Spark: http://spark.apache.org/
*/
import scala.collection.mutable
@michael-erasmus
michael-erasmus / actions_taken_to_redshift.py
Created October 30, 2014 14:52
actions_taken_to_redshift.py
import luigi
from buffer_redshift_etl import *
class ExtractActionsTakenTask(BufferRedshiftETLExtractTask):
    """Extract step of the actions-taken Redshift ETL pipeline."""

    def etl_name(self):
        """Return the identifier this ETL job is registered under."""
        job_name = 'actions-taken'
        return job_name
class TransformActionsTakenTask(BufferRedshiftETLTransformTask):
    """Transform step of the actions-taken Redshift ETL pipeline."""

    def etl_name(self):
        """Return the identifier this ETL job is registered under."""
        job_name = 'actions-taken'
        return job_name
@michael-erasmus
michael-erasmus / transform_actions_taken.pig
Created October 30, 2014 14:10
transform_actions_taken.pig
REGISTER '../udfs/jython/actions_taken.py' USING jython AS actions_taken;
REGISTER '../udfs/python/actions_taken.py' USING streaming_python AS actions_taken1;
raw = load '$OUTPUT_PATH/extract-actions-taken'
using PigStorage()
as (
user_id:chararray,
visitor_id:chararray,
client_id:chararray,
last_modified:chararray,