This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count the lines of a text file, and the lines mentioning "8080", on a
# standalone Spark cluster.
from pyspark import SparkContext

dataFile = "./sbin/start-master.sh"
sc = SparkContext("spark://ip-XXX-XX-X-XX.sa-east-1.compute.internal:7077", "Simple App")
try:
    textRdd = sc.textFile(dataFile)
    # "%s %s" reproduces the output of the former Python 2 print statement
    # (label, one space, value) and is valid on both Python 2 and Python 3.
    print("%s %s" % ("Number of lines: ", textRdd.count()))
    print("%s %s" % ("Number of lines with 8080: ",
                     textRdd.filter(lambda x: '8080' in x).count()))
finally:
    # Always release the context, even when one of the jobs above fails.
    sc.stop()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_sample_from_distribution(n):
    """
    Creates a list of size n.

    Each value will be 1 if some person would vote for Alice and
    0 if they would vote for Bob. Every value comes from one call to
    get_opinion_from_random_person().
    """
    # The collected opinions must be returned: callers average them.
    return [get_opinion_from_random_person() for _ in range(n)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 2x3 grid of histograms: one panel per entry of `days`, each showing the
# distribution of that many sample means (every sample has 100 people).
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
fig.subplots_adjust(hspace=0.4, wspace=0.3)
axes = axes.ravel()  # flatten the 2x3 grid so panels can be indexed 0..5
days = [5, 50, 500, 1000, 10000, 100000]
for i, number_of_days in enumerate(days):
    sample_means = get_n_sample_means_from_distribution(
        samples_number=number_of_days,
        sample_size=100)
    axes[i].hist(sample_means, bins=30)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 2x3 grid of histograms: one panel per entry of `sample_size`, each showing
# the distribution of 100 sample means computed from samples of that size.
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
fig.subplots_adjust(hspace=0.4, wspace=0.3)
axes = axes.ravel()  # flatten the 2x3 grid so panels can be indexed 0..5
# NOTE(review): get_sample_from_distribution draws opinions one at a time in
# a Python loop, so the largest sizes here (1e7 and especially 1e14 per
# sample, times 100 samples) look infeasible in time and memory -- confirm
# these values are intended.
sample_size = [10, 100, 10000, 1000000, 10000000, 100000000000000]
for i, people_per_sample in enumerate(sample_size):
    sample_means = get_n_sample_means_from_distribution(
        samples_number=100,
        sample_size=people_per_sample)
    axes[i].hist(sample_means, bins=30)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_margin_of_error_interval(sample_size, number_of_std_dvt):
    """
    Estimate the mean of one sample and the margin of error around it.

    Draws a single sample of `sample_size` opinions, then estimates the
    standard deviation of the sample mean as the sample's standard
    deviation divided by sqrt(sample_size).

    Args:
        sample_size: number of opinions in the sample.
        number_of_std_dvt: how many estimated standard deviations the
            margin of error should span.

    Returns:
        A dict with the keys "error" (the margin of error) and "mean"
        (the sample mean).
    """
    day_sample = get_n_samples_from_distribution(
        samples_number=1,
        sample_size=sample_size)
    sample_mean = np.mean(day_sample)
    sample_std_dvt = np.std(day_sample)
    estimated_std_dvt = sample_std_dvt / math.sqrt(sample_size)
    margin_of_error = number_of_std_dvt * estimated_std_dvt
    return {"error": margin_of_error, "mean": sample_mean}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_n_samples_from_distribution(samples_number, sample_size):
    """
    Get multiple samples from our target 'unknown distribution'.

    Returns a list with `samples_number` entries; each entry is the result
    of get_sample_from_distribution(sample_size).
    """
    return [get_sample_from_distribution(sample_size)
            for _ in range(samples_number)]
def get_n_sample_means_from_distribution(samples_number, sample_size): | |
samples = get_n_samples_from_distribution( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collect the margin-of-error result for sample sizes 10**1 .. 10**7.
x = []   # exponents (powers of ten) used as sample sizes
y = []   # dicts returned by get_margin_of_error_interval, one per exponent
y1 = []  # not filled in the lines shown here
y2 = []  # not filled in the lines shown here
fig, axes = plt.subplots(1, 2, figsize=(10, 6))
axes = axes.ravel()
for power_of_ten in range(1, 8):
    x.append(power_of_ten)
    y.append(get_margin_of_error_interval(10 ** power_of_ten, 2))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Margin of error for a 10000-person sample, spanning 2 estimated standard
# deviations. The result is not stored by this statement.
get_margin_of_error_interval(sample_size=10000, number_of_std_dvt=2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import matplotlib.pyplot as plt | |
import math | |
def get_opinion_from_random_person(probability_of_alice_win=0.53):
    """
    Simulate the opinion of one randomly chosen person.

    Args:
        probability_of_alice_win: success probability of the draw.
            Usually this probability is not known; it defaults to 0.53
            here for simulation purposes.

    Returns:
        The outcome of a single-trial binomial draw
        (np.random.binomial with n=1): 1 on success, otherwise 0.
    """
    return np.random.binomial(
        n=1,
        p=probability_of_alice_win)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from time import gmtime, strftime | |
from kafka import KafkaProducer | |
def publish_message(producer_instance, topic_name, key, value): | |
try: | |
key_bytes = bytes(key, encoding='utf-8') | |
value_bytes = bytes(value, encoding='utf-8') | |
producer_instance.send(topic_name, key=key_bytes, value=value_bytes) | |
producer_instance.flush() |
OlderNewer