Skip to content

Instantly share code, notes, and snippets.

View michael-erasmus's full-sized avatar

Michael Erasmus michael-erasmus

View GitHub Profile
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@michael-erasmus
michael-erasmus / clean_data.py
Created November 3, 2015 00:07
Simple logistic model
selected_words = [
'receipt',
'card',
'refund',
'month',
'monthly',
'plan',
'profit',
'charged',
'charge',
@michael-erasmus
michael-erasmus / tf-idf-keywords.py
Created November 2, 2015 23:56
TF-IDF keywords using Graphlab
import re
import graphlab

# Clean the raw document bodies into a bag-of-words-friendly 'words' column.
# Step 1: strip HTML tags.
docs['words'] = docs['body'].apply(lambda doc: re.sub(r"<[^>]*>", "", doc))
# Step 2: lowercase, trim surrounding whitespace, and replace every
# non-word character and digit with a space (raw strings so the regex
# escapes are not interpreted by Python first).
docs['words'] = docs['words'].apply(lambda doc: re.sub(r"[\W\d]", " ", doc.lower().strip()))

docs = graphlab.SFrame(docs)
docs['word_counts'] = graphlab.text_analytics.count_words(docs['words'])
docs_tfidf = graphlab.text_analytics.tf_idf(docs['words'])
# Top-10 keywords per document, highest TF-IDF first.
# BUG FIX: the original slice [1:10] skipped index 0 — the single
# highest-scoring keyword — and returned only 9 terms; [:10] returns
# the actual top 10.
docs['top10'] = docs_tfidf['docs'].apply(lambda t: " ".join(sorted(t, key=t.get, reverse=True)[:10]))
@michael-erasmus
michael-erasmus / experiment_result_p_value.sql
Last active September 28, 2015 15:08
Statistical significance UDF in Redshift
create or replace function experiment_result_p_value(control_size float, control_conversion float, experiment_size float, experiment_conversion float)
returns float
stable
as $$
from scipy.stats import chi2_contingency
from numpy import array
observed = array([
[control_size - control_conversion, control_conversion],
[experiment_size - experiment_conversion, experiment_conversion]
@michael-erasmus
michael-erasmus / tf-idf.py
Created September 24, 2015 20:38
Tf-idf example
import os
import math
import re
import pandas as pd
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
#get a subset of the dataset
categories = [
/*
This example uses Scala. Please see the MLlib documentation for a Java example.
Try running this code in the Spark shell. It may produce different topics each time (since LDA includes some randomization), but it should give topics similar to those listed above.
This example is paired with a blog post on LDA in Spark: http://databricks.com/blog
Spark: http://spark.apache.org/
*/
import scala.collection.mutable
@michael-erasmus
michael-erasmus / actions_taken_to_redshift.py
Created October 30, 2014 14:52
actions_taken_to_redshift.py
import luigi
from buffer_redshift_etl import *
class ExtractActionsTakenTask(BufferRedshiftETLExtractTask):
    """Extract step of the actions-taken Redshift ETL pipeline."""

    def etl_name(self):
        """Return the identifier this ETL job is registered under."""
        job_name = 'actions-taken'
        return job_name
class TransformActionsTakenTask(BufferRedshiftETLTransformTask):
    """Transform step of the actions-taken Redshift ETL pipeline."""

    def etl_name(self):
        """Return the identifier this ETL job is registered under."""
        job_name = 'actions-taken'
        return job_name
@michael-erasmus
michael-erasmus / transform_actions_taken.pig
Created October 30, 2014 14:10
transform_actions_taken.pig
REGISTER '../udfs/jython/actions_taken.py' USING jython AS actions_taken;
REGISTER '../udfs/python/actions_taken.py' USING streaming_python AS actions_taken1;
raw = load '$OUTPUT_PATH/extract-actions-taken'
using PigStorage()
as (
user_id:chararray,
visitor_id:chararray,
client_id:chararray,
last_modified:chararray,