This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Assumes a SparkContext is available as `sc`
# and a Spark SQL entry point (the object providing createDataFrame) as `spark`.
from pyspark.sql import Row

# Every line of data.csv is split on ';'; field 0 becomes `name`,
# field 1 is converted to an int and becomes `age`.
rdd = (sc.textFile('data.csv')
       .map(lambda x: x.split(';'))
       .map(lambda x: Row(name=x[0],
                          age=int(x[1]))))
# Build the DataFrame from the RDD of Rows; column names come from the Row fields.
df = spark.createDataFrame(rdd)
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collection.JavaConversions._ | |
import org.apache.hadoop.conf.Configuration | |
import org.apache.spark.{SparkContext, SparkConf} | |
import org.apache.spark.rdd.RDD | |
import org.bson.BSONObject | |
import org.bson.types.BasicBSONList | |
import java.io._ | |
import com.mongodb.hadoop.{ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# You'll need the Greenville gist as well:
# https://gist.github.com/dhesse/aa2e2425548bf3e4ceb7
from greenville import compareVisuallyToBenford, data

# Restrict to amounts >= 10 so the first two characters of str(amount)
# are both digits (assumes Amount is numeric -- confirm against the gist).
indices = data.Amount >= 10
# Leading two digits of each qualifying amount.
firstTwoDigits = data.Amount[indices].apply(lambda x: int(str(x)[:2]))
# Per-vendor number of payments whose amount starts with "25".
vc25 = data[indices][firstTwoDigits == 25]['Vendor Name'].value_counts()
# Show only vendors with more than 10 such payments.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(vc25[vc25 > 10])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# You'll need the Greenville gist as well:
# https://gist.github.com/dhesse/aa2e2425548bf3e4ceb7
from greenville import compareVisuallyToBenford, data

# Frequency of each leading two-digit prefix among amounts >= 10.
# Parentheses replace the backslash continuations, which break as soon as
# anything (even whitespace) follows the backslash.
doubleDigitCounts = (
    data.Amount[data.Amount >= 10]
    .apply(lambda x: int(str(x)[:2]))
    .value_counts()
)
compareVisuallyToBenford(doubleDigitCounts, 'Counts of First Two Digits')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import matplotlib.pyplot as plt

# Load the spending data.
# - 'Amount' arrives as text with a '$' sign; strip it (and any thousands
#   separators, in case the export contains them) before converting to float.
# - parse_dates must be a bool, list or dict; a bare int is not accepted,
#   so the date column at position 9 is given as a one-element list.
data = pd.read_csv(
    'data/Greenville_County_School_District_Spending.csv',
    converters={'Amount': lambda x: float(x.replace('$', '').replace(',', ''))},
    parse_dates=[9])

# Frequency of each leading digit among amounts >= 1
# (smaller amounts would start with '0').
singleDigitCounts = (
    data.Amount[data.Amount >= 1]
    .apply(lambda x: int(str(x)[:1]))
    .value_counts()
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Simple demo script explaining how to extract twitter users'
friend lists using the python twitter package."""
import time
import twitter

# Placeholder credentials: substitute the keys and tokens of your own app.
api = twitter.Api(consumer_key='your_key',
                  consumer_secret='your_secret',
                  access_token_key='your_token_key',
                  access_token_secret='your_token_secret')
while not api.GetRateLimitStatus()\ | |
['resources']['friends']['/friends/ids']['remaining']: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pymongo import MongoClient | |
from collections import defaultdict | |
from dateutil.parser import parse | |
from random import randint | |
from pyspark import SparkContext | |
from pyspark.streaming import StreamingContext | |
class HourCounter(object): | |
'''Saves hourly counts from Twitter messages to MongoDB.
WARNING: This class is for illustration purposes only and will hammer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.cross_validation import train_test_split | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.metrics import f1_score | |
def goodFFS(X, y, nFeatures): | |
"""Proper forward feature selection. | |
Arguments: | |
X -- matrix containing feature vectors | |
y -- label data | |
nFeatures -- maximum number of features |
NewerOlder