Skip to content

Instantly share code, notes, and snippets.

# Assumes a SparkContext is available as `sc` and a Spark SQL entry point
# (used below for createDataFrame) is available as `spark`.
# Each line of data.csv is split on ';' and turned into a Row(name, age).
rdd = (
    sc.textFile('data.csv')
    .map(lambda line: line.split(';'))
    .map(lambda fields: Row(name=fields[0], age=int(fields[1])))
)
df = spark.createDataFrame(rdd)
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import collection.JavaConversions._
import org.apache.hadoop.conf.Configuration
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD
import org.bson.BSONObject
import org.bson.types.BasicBSONList
import java.io._
import com.mongodb.hadoop.{
# you'll need the Greenville gist as well:
# https://gist.github.com/dhesse/aa2e2425548bf3e4ceb7
from greenville import compareVisuallyToBenford, data
# Restrict to amounts of at least 10 so that the first two characters of the
# stringified amount are its two leading digits.
indices = data.Amount >= 10
# Among those rows, keep the ones whose amount starts with "25" and count how
# often each vendor appears.
vc25 = data[indices][
    data.Amount[indices].apply(lambda x: int(str(x)[:2])) == 25
]['Vendor Name'].value_counts()
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print expr` statement is a SyntaxError on Python 3.
print(vc25[vc25 > 10])
# you'll need the Greenville gist as well:
# https://gist.github.com/dhesse/aa2e2425548bf3e4ceb7
from greenville import compareVisuallyToBenford, data
# Tally the first two digits of every amount that is at least 10, then plot
# the tallies with the helper imported from the greenville gist.
doubleDigitCounts = (
    data.Amount[data.Amount >= 10]
    .apply(lambda amount: int(str(amount)[:2]))
    .value_counts()
)
compareVisuallyToBenford(doubleDigitCounts, 'Counts of First Two Digits')
import pandas as pd
import matplotlib.pyplot as plt
# Load the spending data. The converter strips the '$' sign from the 'Amount'
# column and turns the remainder into a float.
data = pd.read_csv(
    'data/Greenville_County_School_District_Spending.csv',
    converters={'Amount': lambda x: float(x.replace('$', ''))},
    # parse_dates accepts a bool, a list or a dict; a bare int is rejected
    # with a TypeError by current pandas, so the column index goes in a list.
    parse_dates=[9],
)
# Count how often each leading digit occurs among amounts of at least 1.
singleDigitCounts = (
    data.Amount[data.Amount >= 1]
    .apply(lambda x: int(str(x)[:1]))
    .value_counts()
)
@dhesse
dhesse / getfriends.py
Last active September 16, 2015 13:50
"""Simple demo script explaining how to extract twitter users'
friend lists using the python twitter package."""
import time
import twitter
# Placeholder credentials: replace each value with the one for your own
# Twitter application before running.
credentials = {
    'consumer_key': 'your_key',
    'consumer_secret': 'your_secret',
    'access_token_key': 'your_token_key',
    'access_token_secret': 'your_token_secret',
}
api = twitter.Api(**credentials)
while not api.GetRateLimitStatus()\
['resources']['friends']['/friends/ids']['remaining']:
from pymongo import MongoClient
from collections import defaultdict
from dateutil.parser import parse
from random import randint
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
class HourCounter(object):
'''Saves hourly counts from Twitter messages to MongoDB.
WARNING: This class is for illustration purposes only and will hammer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
def goodFFS(X, y, nFeatures):
"""Proper forward feature selection.
Arguments:
X -- matrix containing feature vectors
y -- label data
nFeatures -- maximum number of features