Tal Yarkoni tyarkoni

@tyarkoni
tyarkoni / silly_pie_chart.py
Created April 1, 2018 20:01
code for silly pie chart (by request)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
rows = [
('Doing research', 50),
('Having meetings', 6),
('Begging funding agencies for money so I can keep my job', 3),
('Doing paperwork', 2),
('Reviewing papers', 2),
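The preview cuts off inside the `rows` list. A minimal, self-contained sketch of how the snippet might continue; the final data entry and the plotting calls are my own placeholders, not the original gist:

```python
# Hypothetical completion: the preview truncates `rows`, so the last entry and
# everything after the list are placeholder assumptions, not the original code.
import matplotlib
matplotlib.use("Agg")  # render off-screen (the gist runs with %matplotlib inline)
import matplotlib.pyplot as plt

rows = [
    ('Doing research', 50),
    ('Having meetings', 6),
    ('Begging funding agencies for money so I can keep my job', 3),
    ('Doing paperwork', 2),
    ('Reviewing papers', 2),
    ('Everything else (placeholder entry)', 37),  # hypothetical filler so shares sum to 100
]

labels, sizes = zip(*rows)
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(sizes, labels=labels, autopct='%1.0f%%', startangle=90)
ax.axis('equal')  # keep the pie circular
fig.savefig('silly_pie_chart.png')
```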
@tyarkoni
tyarkoni / p_hacked_effect_sizes.py
Created January 25, 2018 23:02
Illustrating the effects of p-hacking on observed effect sizes
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
def run_study(step_size=50, max_n=200, num_tests=10, alpha=0.05):
''' Run a single study in increments of N until we either (a) achieve
significance, or (b) hit a maximum sample size. To model p-hacking, we
conduct num_tests independent tests after each increment of sampling. '''
X = np.zeros((0, num_tests))
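The preview ends after the first line of `run_study`. A hedged reconstruction of the loop the docstring describes (sample in increments, test repeatedly, stop at first significance); the body past `X = np.zeros(...)` and the stopping/return details are my assumptions, not the original gist:

```python
# Sketch of the p-hacking simulation described in the docstring above; the
# loop body is my reconstruction, not the original gist's code.
import numpy as np
from scipy import stats

def run_study(step_size=50, max_n=200, num_tests=10, alpha=0.05, rng=None):
    """Sample in increments of step_size up to max_n; after each increment,
    run num_tests independent one-sample t-tests (true effect = 0) and stop
    at the first significant result. Return the observed Cohen's d of the
    'winning' test, or the final d of test 0 if nothing reaches significance."""
    rng = np.random.default_rng(rng)
    X = np.zeros((0, num_tests))
    while X.shape[0] < max_n:
        X = np.vstack([X, rng.standard_normal((step_size, num_tests))])
        t, p = stats.ttest_1samp(X, 0)
        if (p < alpha).any():
            winner = int(np.argmin(p))
            return X[:, winner].mean() / X[:, winner].std(ddof=1)
    return X[:, 0].mean() / X[:, 0].std(ddof=1)

# Even with a true effect of zero, "significant" studies report inflated |d|.
ds = [run_study(rng=i) for i in range(200)]
```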
@tyarkoni
tyarkoni / simulate_matching.py
Created April 3, 2016 15:59
Matching on unreliable variables produces residual confounding
'''
A small simulation to demonstrate that matching trials does not solve the
problem of residual confounding. For description of original problem, see
http://dx.doi.org/10.1371/journal.pone.0152719
Here we simulate a situation where we match trials from two conditions that
differ in Y on an indicator M. By hypothesis, there is no difference in Y in
the population after controlling for M. But because of measurement error,
matching on M will, on average, leave a residual mean difference in the Y's.
Raising the reliability of M (REL_M) will decrease this difference, and setting
it to 1.0 will eliminate it completely, demonstrating that matching works just
fine when M is measured without error.
'''
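The preview shows only the docstring. A hedged sketch of the simulation it describes; the sampling scheme, matching rule, and parameter values below are my assumptions, not the original gist:

```python
# Sketch of the residual-confounding simulation described above; the matching
# window and all parameter values are my own assumptions.
import numpy as np

REL_M = 0.5      # reliability of the matching variable M (1.0 removes the bias)
N = 200_000      # trials per condition
rng = np.random.default_rng(0)

# Latent true scores: condition B is shifted on T, and Y depends only on T,
# so controlling for T perfectly would remove the group difference in Y.
t_a = rng.standard_normal(N)
t_b = rng.standard_normal(N) + 1.0
y_a, y_b = t_a.copy(), t_b.copy()

# Observed M = true score + noise, scaled so Var(T) / Var(M) = REL_M.
noise_sd = np.sqrt((1 - REL_M) / REL_M)
m_a = t_a + rng.normal(0, noise_sd, N)
m_b = t_b + rng.normal(0, noise_sd, N)

# Crude matching: keep only trials whose observed M falls in a narrow window.
lo, hi = 0.4, 0.6
keep_a = (m_a > lo) & (m_a < hi)
keep_b = (m_b > lo) & (m_b < hi)

# Despite matching on M, the latent T (and hence Y) still differs on average,
# because M regresses toward each condition's own mean.
residual = y_b[keep_b].mean() - y_a[keep_a].mean()
```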
@tyarkoni
tyarkoni / t1_t2_correlation_sim.py
Last active February 23, 2016 02:15
Simulates correlation between effect sizes of original studies and replication studies
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
g1_d_mu = 0.4
g1_d_sd = 0.4
prop_null = 0.3
n_subs = 20
n_studies = 400
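Only the parameter block survives the preview. A hedged sketch of one way the simulation might proceed from these parameters; the two-group t-test design and effect-size mixture below are my assumptions, not necessarily the gist's:

```python
# Sketch of an original-vs-replication effect size simulation using the
# parameters shown above; the study design is my assumption.
import numpy as np
import scipy.stats as ss

g1_d_mu, g1_d_sd = 0.4, 0.4   # distribution of true effect sizes
prop_null = 0.3               # proportion of studies with a true effect of 0
n_subs, n_studies = 20, 400
rng = np.random.default_rng(1)

def observed_d(true_d):
    """Two-group study with n_subs per group; return observed Cohen's d."""
    a = rng.normal(0, 1, n_subs)
    b = rng.normal(true_d, 1, n_subs)
    pooled_sd = np.sqrt((a.var(ddof=1) + b.var(ddof=1)) / 2)
    return (b.mean() - a.mean()) / pooled_sd

# Mixture of true effects: some null, the rest drawn from N(g1_d_mu, g1_d_sd).
true_d = rng.normal(g1_d_mu, g1_d_sd, n_studies)
true_d[rng.random(n_studies) < prop_null] = 0.0

# Run each study twice (original and replication) and correlate observed d's.
orig = np.array([observed_d(d) for d in true_d])
rep = np.array([observed_d(d) for d in true_d])
r, _ = ss.pearsonr(orig, rep)
```

With small samples, sampling error dilutes the correlation well below 1 even when both runs share identical true effects.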
@tyarkoni
tyarkoni / predict_from_text.py
Last active March 10, 2020 02:10
simple example predicting binary outcome from text features with sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
# Grab just two categories from the 20 newsgroups dataset
categories = ['sci.space', 'rec.autos']
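The preview stops before the pipeline is built. A minimal sketch of the TF-IDF + logistic regression pipeline the imports suggest, run on a toy corpus so it works offline; the toy documents are placeholders (the gist itself loads the `sci.space` and `rec.autos` categories via `fetch_20newsgroups`):

```python
# Sketch of a text-classification pipeline like the one the gist builds; the
# tiny corpus below is a stand-in for the 20 newsgroups data.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

docs = [
    "the rocket launch reached low earth orbit",
    "nasa delayed the shuttle mission again",
    "the new sedan gets great gas mileage",
    "i replaced the brake pads on my car",
] * 10  # repeat so each class has a few dozen examples
labels = np.array([0, 0, 1, 1] * 10)  # 0 = sci.space, 1 = rec.autos

# Chain TF-IDF feature extraction and a logistic regression classifier.
clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression()),
])
clf.fit(docs, labels)
preds = clf.predict(["orbit around the moon", "my car needs new tires"])
```

`Pipeline` keeps vectorization and classification as one estimator, so the same object handles raw text at both fit and predict time.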