Skip to content

Instantly share code, notes, and snippets.

Avatar

Waldecir Faria wfaria

View GitHub Profile
@wfaria
wfaria / PySparkTest.py
Created Aug 23, 2018
PySpark test code
View PySparkTest.py
from pyspark import SparkContext
# Count the lines of a text file through the Spark master at the URL below,
# then count how many of those lines contain the string '8080'.
dataFile = "./sbin/start-master.sh"
sc = SparkContext("spark://ip-XXX-XX-X-XX.sa-east-1.compute.internal:7077", "Simple App")
try:
    textRdd = sc.textFile(dataFile)
    # Single-argument print(...) is valid on both Python 2 and Python 3 and
    # produces the same text the original print statements did.
    print("Number of lines:  %d" % textRdd.count())
    print("Number of lines with 8080:  %d" % textRdd.filter(lambda x: '8080' in x).count())
finally:
    # Always release the context, even when one of the jobs above fails.
    sc.stop()
@wfaria
wfaria / get_sample_from_distribution.py
Created Mar 9, 2019
Election Poll Simulation - Getting a Sample of size 10
View get_sample_from_distribution.py
def get_sample_from_distribution(n):
    """
    Build a sample of n simulated opinions.

    Each entry is the result of one call to get_opinion_from_random_person():
    1 if that person would vote on Alice and 0 if they would vote on Bob.

    Returns:
        A list with n entries (empty when n is 0).
    """
    # The list must be returned: callers collect and average these samples.
    return [get_opinion_from_random_person() for _ in range(n)]
@wfaria
wfaria / sample_histogram.py
Created Mar 9, 2019
Election Poll Simulation - Sample Histogram
View sample_histogram.py
# One histogram of sample means per entry in `days`, laid out on a 2x3 grid.
# Every sample holds 100 opinions; only the number of samples changes.
days = [5, 50, 500, 1000, 10000, 100000]
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
fig.subplots_adjust(hspace=0.4, wspace=0.3)
axes = axes.ravel()  # flatten the grid so panels can be indexed 0..5
for i, number_of_samples in enumerate(days):
    sample_means = get_n_sample_means_from_distribution(
        samples_number=number_of_samples,
        sample_size=100)
    axes[i].hist(sample_means, bins=30)
@wfaria
wfaria / election_poll_sample_histogram2.py
Created Mar 9, 2019
Election Poll Simulation - Another Histogram with fixed sample number
View election_poll_sample_histogram2.py
# One histogram of sample means per entry in `sample_size`, on a 2x3 grid.
# The number of samples is fixed at 100; only the size of each sample changes.
sample_size = [10, 100, 10000, 1000000, 10000000, 100000000000000]
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
fig.subplots_adjust(hspace=0.4, wspace=0.3)
axes = axes.ravel()  # flatten the grid so panels can be indexed 0..5
for i, people_per_sample in enumerate(sample_size):
    sample_means = get_n_sample_means_from_distribution(
        samples_number=100,
        sample_size=people_per_sample)
    axes[i].hist(sample_means, bins=30)
@wfaria
wfaria / get_margin_of_error_interval.py
Created Mar 10, 2019
Election Poll - margin of error method
View get_margin_of_error_interval.py
def get_margin_of_error_interval(sample_size, number_of_std_dvt):
    """
    Estimate the mean and margin of error from a single simulated sample.

    One sample of `sample_size` opinions is drawn through
    get_n_samples_from_distribution. The margin of error is
    `number_of_std_dvt` times the sample standard deviation divided by
    sqrt(sample_size).

    Returns:
        A dict with the keys "error" (the margin of error) and "mean"
        (the sample mean).
    """
    observations = get_n_samples_from_distribution(
        samples_number=1,
        sample_size=sample_size)
    # Standard deviation of the sample scaled down by sqrt(sample_size).
    standard_error = np.std(observations) / math.sqrt(sample_size)
    return {
        "error": number_of_std_dvt * standard_error,
        "mean": np.mean(observations),
    }
@wfaria
wfaria / get_n_samples_from_distribution.py
Last active Mar 10, 2019
Election Poll Simulation - Getting multiple samples
View get_n_samples_from_distribution.py
def get_n_samples_from_distribution(samples_number, sample_size):
    """
    Draw several samples from the simulated 'unknown distribution'.

    Calls get_sample_from_distribution(sample_size) once per requested sample.

    Returns:
        A list with `samples_number` entries, one per call.
    """
    return [
        get_sample_from_distribution(sample_size)
        for _ in range(samples_number)
    ]
def get_n_sample_means_from_distribution(samples_number, sample_size):
samples = get_n_samples_from_distribution(
View margin_of_error_comp.py
# Collect margin-of-error results for sample sizes 10**1 through 10**7.
# x holds the exponent, y the dict returned for that sample size.
x, y, y1, y2 = [], [], [], []
fig, axes = plt.subplots(1, 2, figsize=(10, 6))
axes = axes.ravel()
for power_of_ten in range(1, 8):
    people_polled = 10 ** power_of_ten
    x.append(power_of_ten)
    # 2 standard deviations around the sample mean.
    y.append(get_margin_of_error_interval(people_polled, 2))
View final_election_poll_check.py
# Final check: compute the mean and margin of error for one simulated sample
# of 10000 opinions at 2 standard deviations. The returned dict is not stored.
get_margin_of_error_interval(sample_size = 10000, number_of_std_dvt = 2)
@wfaria
wfaria / get_opinion_from_random_person.py
Last active Mar 10, 2019
Election Poll Simulation - Asking opinion from a single person.
View get_opinion_from_random_person.py
import numpy as np
import matplotlib.pyplot as plt
import math
def get_opinion_from_random_person(probability_of_alice_win=0.53):
    """
    Simulate asking one random person for their vote.

    Args:
        probability_of_alice_win: Chance that the person votes on Alice.
            In a real poll this probability is not known; it is fixed here
            (default 0.53) for simulation purposes.

    Returns:
        The outcome of a single Bernoulli trial: 1 with probability
        `probability_of_alice_win`, otherwise 0.
    """
    # A binomial draw with n = 1 is one yes/no trial.
    return np.random.binomial(
        n=1,
        p=probability_of_alice_win)
@wfaria
wfaria / HeartbeatProducer.py
Last active May 3, 2020
Kafka Producer example for Medium article
View HeartbeatProducer.py
import time
from time import gmtime, strftime
from kafka import KafkaProducer
def publish_message(producer_instance, topic_name, key, value):
try:
key_bytes = bytes(key, encoding='utf-8')
value_bytes = bytes(value, encoding='utf-8')
producer_instance.send(topic_name, key=key_bytes, value=value_bytes)
producer_instance.flush()
You can’t perform that action at this time.