Skip to content

Instantly share code, notes, and snippets.

View wfaria's full-sized avatar

Waldecir Faria wfaria

View GitHub Profile
# One-off estimate for a 10,000-person poll, two standard deviations wide.
get_margin_of_error_interval(sample_size=10000, number_of_std_dvt=2)

# Series collected for plotting: x holds the power of ten, y the estimate
# (a dict with "error" and "mean") for a poll of 10 ** x people.
x, y, y1, y2 = [], [], [], []

# Two side-by-side panels, flattened so they can be indexed 0 and 1.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))
axes = axes.ravel()

for power_of_ten in range(1, 8):
    x.append(power_of_ten)
    y.append(
        get_margin_of_error_interval(
            sample_size=10 ** power_of_ten,
            number_of_std_dvt=2))
@wfaria
wfaria / get_margin_of_error_interval.py
Created March 10, 2019 00:50
Election Poll - margin of error method
def get_margin_of_error_interval(sample_size, number_of_std_dvt):
    """
    Estimate the mean of a single poll and the margin of error around it.

    Draws one sample of `sample_size` opinions, computes the standard error
    of its mean (sample standard deviation divided by the square root of
    the sample size) and scales it by `number_of_std_dvt`.

    Returns a dict with two keys:
      "error" - the margin of error
      "mean"  - the mean of the drawn sample
    """
    poll = get_n_samples_from_distribution(
        samples_number=1,
        sample_size=sample_size)

    # Standard error of the mean: the spread of the sample shrinks with
    # the square root of how many people were asked.
    standard_error = np.std(poll) / math.sqrt(sample_size)

    return {
        "error": number_of_std_dvt * standard_error,
        "mean": np.mean(poll),
    }
@wfaria
wfaria / election_poll_sample_histogram2.py
Created March 9, 2019 23:32
Election Poll Simulation - Another Histogram with fixed sample number
# 2x3 grid of histograms: one panel per sample size, 100 samples each.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 8))
fig.subplots_adjust(hspace=0.4, wspace=0.3)
axes = axes.ravel()  # flatten the grid so panels are indexed 0..5

# NOTE(review): the last entry is 1e14 people per sample; if sampling draws
# one value per person this panel will not finish in practice - confirm.
sample_size = [10, 100, 10000, 1000000, 10000000, 100000000000000]
for i, people_per_sample in enumerate(sample_size):
    sample_means = get_n_sample_means_from_distribution(
        samples_number=100,
        sample_size=people_per_sample)
    axes[i].hist(sample_means, bins=30)
@wfaria
wfaria / sample_histogram.py
Created March 9, 2019 23:16
Election Poll Simulation - Sample Histogram
# 2x3 grid of histograms: one panel per number of samples ("days"),
# each sample asking 100 people.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 8))
fig.subplots_adjust(hspace=0.4, wspace=0.3)
axes = axes.ravel()  # flatten the grid so panels are indexed 0..5

days = [5, 50, 500, 1000, 10000, 100000]
for i, number_of_days in enumerate(days):
    sample_means = get_n_sample_means_from_distribution(
        samples_number=number_of_days,
        sample_size=100)
    axes[i].hist(sample_means, bins=30)
@wfaria
wfaria / get_n_samples_from_distribution.py
Last active March 10, 2019 00:59
Election Poll Simulation - Getting multiple samples
def get_n_samples_from_distribution(samples_number, sample_size):
    """
    Get multiple samples from our target 'unknown distribution'.

    samples_number: how many independent samples to draw.
    sample_size: how many people are asked in each sample.

    Returns a list with `samples_number` entries, each one the result of
    get_sample_from_distribution(sample_size). The list is empty when
    `samples_number` is zero or negative.
    """
    # Draw each sample directly instead of first building a throwaway
    # list of repeated sizes just to map over it.
    return [
        get_sample_from_distribution(sample_size)
        for _ in range(samples_number)
    ]
def get_n_sample_means_from_distribution(samples_number, sample_size):
samples = get_n_samples_from_distribution(
@wfaria
wfaria / get_sample_from_distribution.py
Created March 9, 2019 23:09
Election Poll Simulation - Getting a Sample of size 10
def get_sample_from_distribution(n):
"""
Creates an array of size n.
Each value will be 1 if some person would vote on Alice and
0 if he would vote on Bob.
"""
sample_opinions = []
for i in range(n):
sample_opinions.append(get_opinion_from_random_person())
@wfaria
wfaria / get_opinion_from_random_person.py
Last active March 10, 2019 01:35
Election Poll Simulation - Asking opinion from a single person.
import numpy as np
import matplotlib.pyplot as plt
import math
def get_opinion_from_random_person(probability_of_alice_win=0.53):
    """
    Simulate asking a single person who they would vote for.

    probability_of_alice_win: chance that the person answers 1. Usually
    this probability is not known; it defaults to 0.53 for simulation
    purposes and can be overridden to simulate a different electorate.

    Returns the outcome of one Bernoulli trial: 1 with probability
    `probability_of_alice_win`, otherwise 0.
    """
    return np.random.binomial(
        n=1,
        p=probability_of_alice_win)
@wfaria
wfaria / HeartbeatStreaming.py
Created August 23, 2018 17:36
Spark Streaming with Kafka example for a Medium article using Python
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
def main():
# Task configuration.
topic = "heartbeat"
brokerAddresses = "localhost:9092"
batchTime = 20
@wfaria
wfaria / PySparkTest.py
Created August 23, 2018 17:24
PySpark test code
from pyspark import SparkContext
# Smoke test for a Spark cluster: count the lines of a file that ships
# with Spark, and how many of them mention "8080".
dataFile = "./sbin/start-master.sh"
sc = SparkContext("spark://ip-XXX-XX-X-XX.sa-east-1.compute.internal:7077", "Simple App")
try:
    textRdd = sc.textFile(dataFile)
    # print() with a single pre-formatted string emits the same text on
    # Python 2 and Python 3 (the old `print a, b` statement is a
    # SyntaxError on Python 3). The two spaces reproduce the original
    # output: the label's trailing space plus print's separator.
    print("Number of lines:  {}".format(textRdd.count()))
    print("Number of lines with 8080:  {}".format(
        textRdd.filter(lambda x: '8080' in x).count()))
finally:
    # Always release the context, even if one of the jobs above fails.
    sc.stop()