
dhesse / gist:856d4cc9311befc81b04
Created May 14, 2015 19:42
Automated multipart Glacier upload
UPLOADID=xxxxx # Replace with the id you got from initiate-multipart-upload
DONE=0
for i in x*  # the chunks produced by split, e.g. xaa, xab, ...
do
    echo $i
    read SIZE _ <<< $(wc -c $i)
    ((UPPER = SIZE + DONE - 1))
    # $DONE and $UPPER must sit inside double quotes so they expand in the range.
    aws glacier upload-multipart-part --account-id - \
        --vault-name Photos --body $i --range "bytes $DONE-$UPPER/*" \
        --upload-id $UPLOADID
    ((DONE = DONE + SIZE))  # the next part starts right after this one
done
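(Not shown in the gist, but the surrounding workflow is roughly: the x* chunks come from running split --bytes on the archive beforehand, the upload id comes from aws glacier initiate-multipart-upload with a matching --part-size, and after the loop the upload is finalized with aws glacier complete-multipart-upload, which needs the total archive size and the SHA-256 tree hash of the archive.)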

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

def badFFS(X, y, nFeatures):
    """Broken forward feature selection. DO NOT USE.

    This is only to demonstrate a pitfall.

    Arguments:
    X -- matrix containing feature vectors
    y -- label data
    nFeatures -- maximum number of features
    """
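    # NOTE: the gist preview ends above; the body below is a hedged sketch,
    # not the author's original code. It illustrates the pitfall named in the
    # docstring: the same held-out split is used both to choose features and
    # to report the final score, so the reported F1 is optimistically biased.
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
    selected, remaining, score = [], list(range(X.shape[1])), 0.0
    while remaining and len(selected) < nFeatures:
        # Greedily add the feature that maximizes F1 *on the test set* --
        # this is the leak: test data drives the selection decision.
        score, best = max(
            (f1_score(ytest,
                      LogisticRegression()
                      .fit(Xtrain[:, selected + [f]], ytrain)
                      .predict(Xtest[:, selected + [f]])), f)
            for f in remaining)
        selected.append(best)
        remaining.remove(best)
    return selected, score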

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

def goodFFS(X, y, nFeatures):
    """Proper forward feature selection.

    Arguments:
    X -- matrix containing feature vectors
    y -- label data
    nFeatures -- maximum number of features
    """
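    # NOTE: the gist preview ends above; the body below is a hedged sketch,
    # not the author's original code. The key difference from badFFS: the
    # data used to score candidate features never overlaps with the data
    # used to report the final score.
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
    selected, remaining = [], list(range(X.shape[1]))
    while remaining and len(selected) < nFeatures:
        # Carve a validation split out of the training data for selection.
        Xfit, Xval, yfit, yval = train_test_split(Xtrain, ytrain)
        _, best = max(
            (f1_score(yval,
                      LogisticRegression()
                      .fit(Xfit[:, selected + [f]], yfit)
                      .predict(Xval[:, selected + [f]])), f)
            for f in remaining)
        selected.append(best)
        remaining.remove(best)
    # The test set is touched exactly once, for an unbiased final estimate.
    model = LogisticRegression().fit(Xtrain[:, selected], ytrain)
    return selected, f1_score(ytest, model.predict(Xtest[:, selected]))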

from pymongo import MongoClient
from collections import defaultdict
from dateutil.parser import parse
from random import randint
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

class HourCounter(object):
    '''Saves hourly counts from Twitter messages to MongoDB.

    WARNING: This class is for illustration purposes only and will hammer
    your database with writes.
    '''
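    # NOTE: the gist preview ends above; everything below is a hedged sketch,
    # not the author's original code. It shows one way a class with these
    # imports could bucket tweets by hour and write the counts to MongoDB.
    def __init__(self, mongoUri='mongodb://localhost:27017'):
        # Database and collection names here are placeholders.
        self.collection = MongoClient(mongoUri).twitter.hourly_counts

    def countBatch(self, rdd):
        # rdd holds JSON-decoded tweets; bucket them by the hour of their
        # 'created_at' timestamp and count per bucket.
        counts = defaultdict(int)
        for tweet in rdd.collect():
            hour = parse(tweet['created_at']).replace(
                minute=0, second=0, microsecond=0)
            counts[hour] += 1
        for hour, n in counts.items():
            # One upsert per hour bucket per batch -- the part that will
            # hammer the database if the batch interval is short.
            self.collection.update_one({'hour': hour},
                                       {'$inc': {'count': n}},
                                       upsert=True)

# Hypothetical wiring, again only a sketch: feed a DStream of tweets through
# the counter once per batch interval.
# sc = SparkContext(appName='HourCounter')
# ssc = StreamingContext(sc, 60)
# tweetStream.foreachRDD(HourCounter().countBatch)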

dhesse / getfriends.py
Last active September 16, 2015 13:50
"""Simple demo script explaining how to extract twitter users'
friend lists using the python twitter package."""
import time
import twitter
api = twitter.Api(consumer_key='your_key',
consumer_secret='your_secret',
access_token_key='your_token_key',
access_token_secret='your_token_secret')
while not api.GetRateLimitStatus()\
['resources']['friends']['/friends/ids']['remaining']:
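    # NOTE: the gist preview ends above; the rest is a hedged sketch, not the
    # author's original code. Given the 'import time' above, the loop most
    # likely just sleeps until the rate-limit window resets.
    time.sleep(60)

# Hypothetical usage once calls are available: fetch the friend ids of a
# user (the screen name is a placeholder).
friends = api.GetFriendIDs(screen_name='some_user')
print(friends)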

import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('data/Greenville_County_School_District_Spending.csv',
                   converters={'Amount': lambda x: float(x.replace('$', ''))},
                   parse_dates=[9])
singleDigitCounts = data\
    .Amount[data.Amount >= 1]\
    .apply(lambda x: int(str(x)[:1]))\
    .value_counts()
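# NOTE: the gist preview ends above. The follow-up snippets import a helper
# called compareVisuallyToBenford from this gist; its real implementation is
# not shown, so the sketch below is only an assumption of what it could look
# like: plot observed leading-digit frequencies next to the Benford
# expectation P(d) = log10(1 + 1/d).
import numpy as np

def compareVisuallyToBenford(counts, title):
    counts = counts.sort_index()
    observed = counts / float(counts.sum())
    benford = np.log10(1 + 1.0 / counts.index.values)
    pd.DataFrame({'observed': observed.values, 'Benford': benford},
                 index=counts.index).plot(kind='bar', title=title)
    plt.show()

# e.g. compareVisuallyToBenford(singleDigitCounts, 'Counts of First Digits')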

# You'll need the Greenville gist as well:
# https://gist.github.com/dhesse/aa2e2425548bf3e4ceb7
from greenville import compareVisuallyToBenford, data

doubleDigitCounts = data\
    .Amount[data.Amount >= 10]\
    .apply(lambda x: int(str(x)[:2]))\
    .value_counts()
compareVisuallyToBenford(doubleDigitCounts, 'Counts of First Two Digits')

# You'll need the Greenville gist as well:
# https://gist.github.com/dhesse/aa2e2425548bf3e4ceb7
from greenville import compareVisuallyToBenford, data

# Vendors whose payment amounts start with the digits 25 more than ten times.
indices = data.Amount >= 10
vc25 = data[indices][
    data.Amount[indices].apply(lambda x: int(str(x)[:2])) == 25
]['Vendor Name'].value_counts()
print(vc25[vc25 > 10])

import collection.JavaConversions._
import org.apache.hadoop.conf.Configuration
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD
import org.bson.BSONObject
import org.bson.types.BasicBSONList
import java.io._
// The preview cuts off mid-import; the mongo-hadoop connector's input and
// output formats are assumed here to complete the line.
import com.mongodb.hadoop.{MongoInputFormat, MongoOutputFormat}