Dirk Hesse dhesse

## spar_1.py
# assume a Spark context is given as sc
# and a Spark SQL context (or session) as spark
# Read data.csv as an RDD of raw text lines.
raw_lines = sc.textFile('data.csv')
# Split every line on ';' into its individual fields.
split_lines = raw_lines.map(lambda line: line.split(';'))
# First field is the name, second field is the age (converted to int).
rdd = split_lines.map(lambda fields: Row(name=fields[0],
                                         age=int(fields[1])))
# Build a DataFrame from the RDD of Row objects.
df = spark.createDataFrame(rdd)

## FoodPriceNotebookImproved.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                dhesse
                / FoodPriceNotebookImproved.ipynb
            
            
              Created
              April 20, 2016 17:40
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## FoodPriceNotebook.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                dhesse
                / FoodPriceNotebook.ipynb
            
            
              Created
              April 18, 2016 18:31
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## readMongo.scala
import collection.JavaConversions._

import org.apache.hadoop.conf.Configuration
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD

import org.bson.BSONObject
import org.bson.types.BasicBSONList
import java.io._
import com.mongodb.hadoop.{

## greenville25.py
# you'll need the Greenville gist as well:
# https://gist.github.com/dhesse/aa2e2425548bf3e4ceb7
from greenville import compareVisuallyToBenford, data

# Keep only amounts of at least 10, so the first two characters of the
# string form are both digits.
indices = data.Amount >= 10
# Leading two digits of each remaining amount, e.g. 2512.4 -> 25.
firstTwoDigits = data.Amount[indices].apply(lambda x: int(str(x)[:2]))
# Per-vendor count of the rows whose amount starts with the digits 25.
vc25 = data[indices][firstTwoDigits == 25]['Vendor Name'].value_counts()

# Show only vendors with more than 10 such rows. print() with a single
# argument behaves the same on Python 2 and Python 3, whereas the print
# statement is a SyntaxError on Python 3.
print(vc25[vc25 > 10])

## greenvilletwodigit.py
# you'll need the Greenville gist as well:
# https://gist.github.com/dhesse/aa2e2425548bf3e4ceb7
from greenville import compareVisuallyToBenford, data

# Only amounts of at least 10 are considered, so the first two characters
# of the string form are both digits.
atLeastTen = data.Amount[data.Amount >= 10]
# Leading two digits of every remaining amount, as an integer.
leadingPairs = atLeastTen.apply(lambda amount: int(str(amount)[:2]))
# How often each two-digit prefix occurs.
doubleDigitCounts = leadingPairs.value_counts()

compareVisuallyToBenford(doubleDigitCounts, 'Counts of First Two Digits')

## greenivlle.py
import pandas as pd
import matplotlib.pyplot as plt
# Load the spending data. The 'Amount' converter removes any '$' character
# and turns the remaining text into a float while reading.
# parse_dates has to be a bool, a list or a dict; a bare integer is not an
# accepted form (current pandas raises a TypeError for it), so the column
# position is passed as a one-element list.
# NOTE(review): column 9 is presumably the date column -- confirm against
# the CSV header.
data = pd.read_csv('data/Greenville_County_School_District_Spending.csv',
                   converters={'Amount': lambda x: float(x.replace('$', ''))},
                   parse_dates=[9])
# Count how often each leading digit occurs among amounts of at least 1
# (smaller amounts would have a leading '0').
singleDigitCounts = data\
  .Amount[data.Amount >= 1]\
  .apply(lambda x: int(str(x)[:1]))\
  .value_counts()


## getfriends.py
"""Simple demo script explaining how to extract twitter users'
friend lists using the python twitter package."""
import time
import twitter
api = twitter.Api(consumer_key='your_key',
                  consumer_secret='your_secret',
                  access_token_key='your_token_key',
                  access_token_secret='your_token_secret')
while not api.GetRateLimitStatus()\
                  ['resources']['friends']['/friends/ids']['remaining']:

## SparkStreamingTweetCountExample.py
from pymongo import MongoClient
from collections import defaultdict
from dateutil.parser import parse
from random import randint
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

class HourCounter(object):
    '''Saves hourly counts from Twitter messages to MongoDB.
    WARNING: This class is for illustration purposes only and will hammer

## goodFFS.py
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
def goodFFS(X, y, nFeatures):
    """Proper forward feature selection.

    Arguments:
    X -- matrix containing feature vectors
    y -- label data
    nFeatures -- maximum number of features
	# assume a spark context is given as sc
	# and a spark sql context as
	rdd = (sc.textFile('data.csv')
	.map(lambda x: x.split(';'))
	.map(lambda x: Row(name = x[0],
	age = int(x[1]))))
	df = spark.createDataFrame(rdd)
	import collection.JavaConversions._

	import org.apache.hadoop.conf.Configuration
	import org.apache.spark.{SparkContext, SparkConf}
	import org.apache.spark.rdd.RDD

	import org.bson.BSONObject
	import org.bson.types.BasicBSONList
	import java.io._
	import com.mongodb.hadoop.{
	# you'll need the Greenville gist as well:
	# https://gist.github.com/dhesse/aa2e2425548bf3e4ceb7
	from greenville import compareVisuallyToBenford, data

	indices = data.Amount >= 10
	vc25 = data[indices][data.Amount[indices]\
	.apply(lambda x: int(str(x)[:2])) == 25]['Vendor Name'].value_counts()

	print vc25[vc25 > 10]
	import pandas as pd
	import matplotlib.pyplot as plt
	data = pd.read_csv('data/Greenville_County_School_District_Spending.csv',
	converters={'Amount': lambda x: float(x.replace('$', ''))},
	parse_dates=9)
	singleDigitCounts = data\
	.Amount[data.Amount >= 1]\
	.apply(lambda x: int(str(x)[:1]))\
	.value_counts()
	"""Simple demo script explaining how to extract twitter users'
	friend lists using the python twitter package."""
	import time
	import twitter
	api = twitter.Api(consumer_key='your_key',
	consumer_secret='your_secret',
	access_token_key='your_token_key',
	access_token_secret='your_token_secret')
	while not api.GetRateLimitStatus()\
	['resources']['friends']['/friends/ids']['remaining']:
	from pymongo import MongoClient
	from collections import defaultdict
	from dateutil.parser import parse
	from random import randint
	from pyspark import SparkContext
	from pyspark.streaming import StreamingContext

	class HourCounter(object):
	'''Saves hourly counts from Twitter messages to MongoDB.
	WARNING: This class is for illustration purposes only and will hammer
	from sklearn.cross_validation import train_test_split
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import f1_score
	def goodFFS(X, y, nFeatures):
	"""Proper forward feature selection.

	Arguments:
	X -- matrix containing feature vectors
	y -- label data
	nFeatures -- maximum number of features