Skip to content

Instantly share code, notes, and snippets.

@dangmai
Created February 18, 2013 22:16
Show Gist options
  • Save dangmai/4981268 to your computer and use it in GitHub Desktop.
Save dangmai/4981268 to your computer and use it in GitHub Desktop.
Simulation of routing performance in Python using different routing methods. Inspired by the RapGenius R [gist](https://gist.github.com/toddwschneider/4947326).
#!/usr/bin/env python
"""
Simulation of Heroku performances under different routing mechanisms (random/
naive vs intelligent). Inspired by RapGenius' simulation, which can be found
at http://rapgenius.com/James-somers-herokus-ugly-secret-lyrics.
This simulation uses numpy, scipy and pandas for data management/processing,
and it uses a slightly different algorithm than the RapGenius simulation,
though the result is very similar.
"""
import csv
import numpy as np
from random import randint
from pandas import DataFrame, Series
from scipy.stats import poisson
import math
NAIVE_ROUTE = "Naive"
INTELLIGENT_ROUTE = "Intelligent"
def simulation(route_mode=NAIVE_ROUTE, reqs_per_min=9000, sim_time_mins=5,
               dyno_nums=100):
    """
    Run one routing simulation and return a DataFrame of requests.

    Parameters
    ----------
    route_mode : str
        NAIVE_ROUTE (each request goes to a uniformly random dyno) or
        INTELLIGENT_ROUTE (each request goes to the dyno that is free, or
        frees up soonest).
    reqs_per_min : int
        Mean request arrival rate per minute (arrivals are Poisson).
    sim_time_mins : int
        Length of the simulated window, in minutes.
    dyno_nums : int
        Number of dynos serving requests.

    Returns
    -------
    DataFrame
        One row per request with columns starting_time,
        theoretical_duration, ending_time, queued, real_duration and
        time_in_queue (all times in milliseconds). The dyno count and
        routing mode are attached as plain `.dynos` / `.mode` instance
        attributes for the reporting code.
    """
    reqs_per_ms = float(reqs_per_min) / 60000
    sim_time = sim_time_mins * 60000  # total simulated milliseconds
    # Number of requests arriving at each millisecond tick.
    requests_at_time = poisson.rvs(reqs_per_ms, size=sim_time)
    # "Unlist" the per-ms counts into one start time per request.
    start_times = [ms for ms, count in enumerate(requests_at_time)
                   for _ in range(count)]
    starting_time_series = Series(start_times)
    # Total number of requests
    total_reqs = len(starting_time_series)
    # Request durations are Weibull-distributed; shape 0.46 follows the
    # RapGenius analysis. The scale wlambda is presumably chosen so the
    # median duration is 50 ms (the Weibull median is
    # lambda * ln(2)**(1/shape)) -- TODO confirm against the original post.
    wshape = 0.46
    wlambda = 50 / (math.log(2) ** (1 / wshape))
    # FIX: use wshape instead of repeating the literal 0.46, and scale the
    # ndarray directly -- the original map() is lazy on Python 3.
    # Right now, don't limit the min and max for request durations.
    theoretical_duration = np.random.weibull(wshape, total_reqs) * wlambda
    theoretical_duration_series = Series(theoretical_duration)
    requests = DataFrame({
        'starting_time': starting_time_series,
        'theoretical_duration': theoretical_duration_series
    })
    # dynos[i] holds the time at which dyno i next becomes available.
    dynos = [0] * dyno_nums
    ending_times = []
    # Theoretically a request is queued iff (ending_time - starting_time)
    # == theoretical_duration, but that comparison is unreliable with
    # floating point, so queuing is tracked explicitly instead.
    queued = []
    for _, request in requests.iterrows():
        row_queued = False
        # FIX: compare strings with ==, not identity (is).
        if route_mode == NAIVE_ROUTE:
            # In naive mode, choose a random dyno index uniformly.
            dyno_index = randint(0, len(dynos) - 1)
        else:
            # In intelligent mode, choose a dyno that is either currently
            # available, or will be available the soonest.
            dyno_index = dynos.index(min(dynos))
        start_at = request["starting_time"]
        if dynos[dyno_index] > start_at:
            # The chosen dyno is busy: the request waits until it frees up.
            row_queued = True
            start_at = dynos[dyno_index]
        ending_time = start_at + request["theoretical_duration"]
        dynos[dyno_index] = ending_time
        ending_times.append(ending_time)
        queued.append(row_queued)
    requests['ending_time'] = Series(ending_times)
    requests['queued'] = Series(queued)
    requests['real_duration'] = requests['ending_time'] - \
        requests['starting_time']
    requests['time_in_queue'] = requests['real_duration'] - \
        requests['theoretical_duration']
    # Metadata for reporting. NOTE(review): these are instance attributes,
    # not columns; recent pandas warns about attribute-style assignment of
    # unknown names but still stores them.
    requests.dynos = dyno_nums
    requests.mode = route_mode
    return requests
"""Below are the functions to aggregate the data"""
def aggfunc_request_queued_nums(result):
    """Return the percentage of requests that were queued, rounded."""
    flags = result["queued"]
    queued_fraction = float(flags.sum()) / len(result)
    return round(queued_fraction * 100)
def aggfunc_mean_time_in_queue(result):
    """Return the mean queue wait over all requests, rounded."""
    waits = result["time_in_queue"]
    return round(waits.mean())
def aggfunc_median_time_in_queue(result):
    """Return the median queue wait over all requests, rounded."""
    waits = result["time_in_queue"]
    return round(waits.median())
def aggfunc_mean_time_in_queue_if_queued(result):
    """Return the mean queue wait, counting only requests that queued."""
    waits_when_queued = result.loc[result["queued"] == True, "time_in_queue"]
    return round(waits_when_queued.mean())
def aggfunc_median_time_in_queue_if_queued(result):
    """Return the median queue wait, counting only requests that queued."""
    waits_when_queued = result.loc[result["queued"] == True, "time_in_queue"]
    return round(waits_when_queued.median())
def to_csv(*results):
    """
    Export a one-row-per-simulation summary of the results to result.csv.

    Each argument is a DataFrame produced by simulation(), carrying the
    `.dynos` and `.mode` metadata attributes it attaches. The CSV column
    order follows the insertion order of `fields` below (dicts preserve
    insertion order on Python 3.7+, so headers and values line up).
    """
    fields = {
        "# of dynos": lambda result: result.dynos,
        "Routing mode": lambda result: result.mode,
        "% of requests queued": aggfunc_request_queued_nums,
        "Mean time in queue": aggfunc_mean_time_in_queue,
        "Mean time in queue if queued": aggfunc_mean_time_in_queue_if_queued,
        "Median time in queue": aggfunc_median_time_in_queue,
        "Median time in queue if queued": aggfunc_median_time_in_queue_if_queued
    }
    # FIX: the original opened the file in 'wb' and used the Python-2-only
    # dict.itervalues(); on Python 3 csv.writer needs a text-mode file
    # opened with newline='' (the csv module handles line endings itself).
    with open('result.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(fields.keys())
        for result in results:
            writer.writerow([aggfunc(result) for aggfunc in fields.values()])
def main():
    """Run the full simulation suite and write the summary CSV."""
    # (routing mode, dyno count) for each run, in reporting order.
    runs = [
        (INTELLIGENT_ROUTE, 50),
        (INTELLIGENT_ROUTE, 60),
        (INTELLIGENT_ROUTE, 75),
        (NAIVE_ROUTE, 75),
        (NAIVE_ROUTE, 100),
        (NAIVE_ROUTE, 150),
        (NAIVE_ROUTE, 200),
        (NAIVE_ROUTE, 500),
        (NAIVE_ROUTE, 1000),
        (NAIVE_ROUTE, 2000),
        (NAIVE_ROUTE, 4000),
    ]
    to_csv(*[simulation(route_mode=mode, dyno_nums=count)
             for mode, count in runs])
# Script entry point: run the simulations only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment