Cameron Davidson-Pilon CamDavidsonPilon

## volunteer.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                CamDavidsonPilon
                / volunteer.ipynb
            
            
              Created
              April 15, 2015 23:16
            
              
                Volunteer Problem in PyMC
              
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## astro.py
from lifelines.utils import concordance_index

# Assumes the data is already loaded into a DataFrame named `df`.
turnover_frequencies = df['turnover']
# NOTE(review): the third argument of concordance_index is `event_observed`.
# The column name suggests True may mean "censored" here -- confirm the
# polarity of this flag against the data and negate it if necessary.
turnover_censored = df['turnover_censored'].astype(bool)
redshift = df['redshift']

ci = concordance_index(turnover_frequencies, redshift, turnover_censored)

# 2 * ci - 1 maps a value in [0, 1] onto [-1, 1].
# print() with a single argument behaves the same on Python 2 and Python 3.
print(2.0 * ci - 1.0)

## data.tsv
pairid    lbwt    age    lastwt    race    smoke    ptd    ht    ui    race1    race2    race3
   1        0      14      135       1       0       0      0     0      1        0        0
   1        1      14      101       3       1       1      0     0      0        0        1
   2        0      15       98       2       0       0      0     0      0        1        0
   2        1      15      115       3       0       0      0     1      0        0        1
   3        0      16       95       3       0       0      0     0      0        0        1
   3        1      16      130       3       0       0      0     0      0        0        1
   4        0      17      103       3       0       0      0     0      0        0        1
   4        1      17      130       3       1       1      0     1      0        0        1
   5        0      17      122       1       1       0      0     0      1        0        0

## lifelines.py
from patsy import dmatrix
from lifelines import CoxPHFitter
import pandas as pd

df = pd.read_csv('/Users/camerondavidson-pilon/Downloads/prostate1.csv')
X = dmatrix('age + hg + sz + sg + rx + pf + status1 + dtime', df, return_type='dataframe')
print X.head()
"""
Notice patsy has removed the redundant variables: `0.2 mg estrogen` and `in bed < 50% daytime`. This is what R does too.
Patsy has introduced an Intercept column, though. We don't want this.

## pydata.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                CamDavidsonPilon
                / pydata.ipynb
            
            
              Created
              August 5, 2015 12:39
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## inverse_index.py

documents = sc.parallelize([
    ('0', "frequency: the frequency vector of customers' purchases (denoted x in literature)."),
    ('1', "recency: the recency vector of customers' purchases (denoted t_x in literature)."),
    ('2', "T: the vector of customers' age (time since first purchase)"),
    ('3', 'iterative_fitting: perform `iterative_fitting` additional fits to find the best'),
    ('4', 'parameters for the model. Setting to 0 will improve peformance but possibly'),
    ('5', 'hurt estimates.'),
    ('6', 'initial_params: set initial params for the iterative fitter.'),
    ('7', 'verbose: set to true to print out convergence diagnostics.'),

## contro.sql
declare @VoteStats table (parentid int, id int, U float, D float)

insert @VoteStats
SELECT
a.parentid,
a.id,
CAST(SUM(case when (VoteTypeID = 2) then 1. else 0. end) + 1. as float) as U,
CAST(SUM(case when (VoteTypeID = 3) then 1. else 0. end) + 1. as float) as D
FROM Posts q
JOIN PostTags qt

## cosine_similarity_the_hard_way.py
from operator import add
from itertools import combinations
from math import sqrt

def emit_pairs(words):
    """Yield ``(pair, 1)`` for every 2-element combination of *words*.

    Pairs are produced by ``itertools.combinations``, so they follow the
    order of the input; each pair is tagged with a count of 1.
    """
    for left, right in combinations(words, 2):
        yield (left, right), 1

def cosine_similarity((w1,w2), cross_product, magnitudes):
    similarity =  cross_product/sqrt(magnitudes.value[w1])/sqrt(magnitudes.value[w2])

## mc.py
def sample(N=2):
    """Return one ``random()`` draw selected by rejection sampling.

    Each attempt sums ``N - 1`` draws from ``random()``. If that partial sum
    is already >= 1 the attempt is discarded. Otherwise one more value is
    drawn, and it is returned if it pushes the total to >= 1; if not, the
    attempt is retried.

    Parameters
    ----------
    N : int
        Number of draws per attempt; must be at least 2.

    Returns
    -------
    float
        The final (N-th) draw of the first accepted attempt.

    Raises
    ------
    ValueError
        If ``N < 2``. The partial sum is then always 0 and a single
        ``random()`` draw is always < 1, so no attempt could ever be
        accepted and the loop would never terminate.
    """
    if N < 2:
        raise ValueError("N must be at least 2")

    while True:
        # `range` instead of the Python-2-only `xrange`; a generator avoids
        # building a throwaway list.
        partial_sum = sum(random() for _ in range(N - 1))
        if partial_sum >= 1:
            continue

        last_draw = random()
        if partial_sum + last_draw >= 1:
            return last_draw

## mod_binary_search.py
def mod_binary_search(round, previous_winner, dataset):
    """Narrow down a winner one power of two at a time.

    Callers start with ``round=0`` and ``previous_winner=0``. While
    ``2**round`` is still smaller than ``len(dataset)``, the external
    ``test(dataset, candidate, 2**(round + 1))`` predicate is consulted:
    a truthy result keeps the current candidate, a falsy one advances it by
    ``2**round``. The candidate held when ``2**round >= len(dataset)`` is
    returned.
    """
    level = round
    candidate = previous_winner
    while 2 ** level < len(dataset):
        modulus = 2 ** (level + 1)
        if not test(dataset, candidate, modulus):
            candidate = candidate + 2 ** level
        level += 1
    return candidate
	from lifelines.utils import concordance_index

	# Assumes the data is already loaded into a DataFrame named `df`.
	turnover_frequencies = df['turnover']
	# NOTE(review): the third argument of concordance_index is `event_observed`.
	# The column name suggests True may mean "censored" here -- confirm the
	# polarity of this flag against the data and negate it if necessary.
	turnover_censored = df['turnover_censored'].astype(bool)
	redshift = df['redshift']

	ci = concordance_index(turnover_frequencies, redshift, turnover_censored)

	# 2 * ci - 1 maps a value in [0, 1] onto [-1, 1].
	# print() with a single argument behaves the same on Python 2 and Python 3.
	print(2.0 * ci - 1.0)
	pairid lbwt age lastwt race smoke ptd ht ui race1 race2 race3
	1 0 14 135 1 0 0 0 0 1 0 0
	1 1 14 101 3 1 1 0 0 0 0 1
	2 0 15 98 2 0 0 0 0 0 1 0
	2 1 15 115 3 0 0 0 1 0 0 1
	3 0 16 95 3 0 0 0 0 0 0 1
	3 1 16 130 3 0 0 0 0 0 0 1
	4 0 17 103 3 0 0 0 0 0 0 1
	4 1 17 130 3 1 1 0 1 0 0 1
	5 0 17 122 1 1 0 0 0 1 0 0
	from patsy import dmatrix
	from lifelines import CoxPHFitter
	import pandas as pd

	df = pd.read_csv('/Users/camerondavidson-pilon/Downloads/prostate1.csv')
	X = dmatrix('age + hg + sz + sg + rx + pf + status1 + dtime', df, return_type='dataframe')
	print X.head()
	"""
	Notice patsy has removed the redundant variables: `0.2 mg estrogen` and `in bed < 50% daytime`. This is what R does too.
	Patsy has introduced an Intercept column, though. We don't want this.

	documents = sc.parallelize([
	('0', "frequency: the frequency vector of customers' purchases (denoted x in literature)."),
	('1', "recency: the recency vector of customers' purchases (denoted t_x in literature)."),
	('2', "T: the vector of customers' age (time since first purchase)"),
	('3', 'iterative_fitting: perform `iterative_fitting` additional fits to find the best'),
	('4', 'parameters for the model. Setting to 0 will improve peformance but possibly'),
	('5', 'hurt estimates.'),
	('6', 'initial_params: set initial params for the iterative fitter.'),
	('7', 'verbose: set to true to print out convergence diagnostics.'),
	declare @VoteStats table (parentid int, id int, U float, D float)

	insert @VoteStats
	SELECT
	a.parentid,
	a.id,
	CAST(SUM(case when (VoteTypeID = 2) then 1. else 0. end) + 1. as float) as U,
	CAST(SUM(case when (VoteTypeID = 3) then 1. else 0. end) + 1. as float) as D
	FROM Posts q
	JOIN PostTags qt
	from operator import add
	from itertools import combinations
	from math import sqrt

	def emit_pairs(words):
	    """Yield ``(pair, 1)`` for every 2-element combination of *words*.

	    Pairs are produced by ``itertools.combinations``, so they follow the
	    order of the input; each pair is tagged with a count of 1.
	    """
	    # The loop and yield must be nested under the def; with the flattened
	    # indentation the function had no body at all.
	    for pair in combinations(words, 2):
	        yield pair, 1

	def cosine_similarity((w1,w2), cross_product, magnitudes):
	similarity = cross_product/sqrt(magnitudes.value[w1])/sqrt(magnitudes.value[w2])
	def sample(N=2):
	    """Return one ``random()`` draw selected by rejection sampling.

	    Each attempt sums ``N - 1`` draws from ``random()``. If that partial
	    sum is already >= 1 the attempt is discarded. Otherwise one more value
	    is drawn, and it is returned if it pushes the total to >= 1; if not,
	    the attempt is retried.

	    Parameters
	    ----------
	    N : int
	        Number of draws per attempt; must be at least 2.

	    Returns
	    -------
	    float
	        The final (N-th) draw of the first accepted attempt.

	    Raises
	    ------
	    ValueError
	        If ``N < 2``. The partial sum is then always 0 and a single
	        ``random()`` draw is always < 1, so no attempt could ever be
	        accepted and the loop would never terminate.
	    """
	    if N < 2:
	        raise ValueError("N must be at least 2")

	    while True:
	        # `range` instead of the Python-2-only `xrange`; a generator avoids
	        # building a throwaway list.
	        partial_sum = sum(random() for _ in range(N - 1))
	        if partial_sum >= 1:
	            continue

	        last_draw = random()
	        if partial_sum + last_draw >= 1:
	            return last_draw
	def mod_binary_search(round, previous_winner, dataset):
	    """Narrow down a winner one power of two at a time.

	    Callers start with ``round=0`` and ``previous_winner=0``. Recursion
	    stops, returning ``previous_winner``, once ``2**round`` reaches
	    ``len(dataset)``. Before that, the external
	    ``test(dataset, previous_winner, 2**(round + 1))`` predicate decides
	    whether the candidate is kept (truthy) or advanced by ``2**round``
	    (falsy) for the next round.
	    """
	    # round starts at 0, previous_winner starts at 0
	    if 2 ** round >= len(dataset):
	        return previous_winner

	    mod = 2 ** (round + 1)
	    if test(dataset, previous_winner, mod):
	        return mod_binary_search(round + 1, previous_winner, dataset)
	    else:
	        return mod_binary_search(round + 1, previous_winner + 2 ** round, dataset)