Created
May 18, 2014 05:05
-
-
Save tartakynov/539646a7cc54edaf7aaa to your computer and use it in GitHub Desktop.
Calculates the required number of Hadoop data nodes, month by month
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from math import ceil | |
from tabulate import tabulate | |
# All sizes in this script are expressed in MiB, so these constants are the
# number of MiB in one GiB and in one TiB respectively.
GiB = 1024
TiB = 1024 * GiB
def main():
    """
    Print a 12-month cluster growth plan as a table.

    For each month of the first year the table lists the cumulative amount
    of disk space required (replicated data plus the temp-space reserve) and
    the number of data nodes needed to hold it. All sizes are computed in
    MiB (``GiB`` is defined as 1024 and ``TiB`` as 1048576 in this file).
    """
    dailyIngest = 30 * GiB  # average daily ingest rate
    replication = 3         # replication factor (by default 3)
    reserve = 0.25          # MapReduce temp space reserve (usually 1/3 or 1/4)
    nodeStorage = 6 * TiB   # amount of disk space per node
    growthRate = 0.05       # growth of ingest rate per month

    # Replicated volume written during the first month (365 / 12 days long).
    monthlyIngest = dailyIngest * replication * 365 / 12.0

    results = []
    for month in range(1, 13):
        if growthRate == 0:
            # No growth: every month adds the same volume.
            multiplier = month
        else:
            # Sum of the geometric series 1 + (1 + g) + ... + (1 + g)^(month - 1).
            multiplier = (pow(1 + growthRate, month) - 1) / growthRate
        stored = monthlyIngest * multiplier
        # Only (1 - reserve) of the raw disk space is available for data.
        total = stored / (1 - reserve)
        # int() keeps the node count integral on Python 2, where ceil()
        # returns a float.
        nodes = int(ceil(total / nodeStorage))
        results.append([month, humanReadableSize(total), nodes])

    # Parenthesised single-argument print works on both Python 2 and 3.
    print(tabulate(results, headers=["Month", "Total amount of space", "Number of data nodes"]))
def humanReadableSize(num):
    """
    Format a size given in MiB as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 and is then
    rendered with one decimal place and the matching binary unit suffix,
    e.g. ``1536`` -> ``"1.5GiB"``.

    :param num: size in MiB (int or float)
    :return: formatted string such as ``"512.0MiB"`` or ``"2.0TiB"``
    """
    for unit in ['MiB', 'GiB', 'TiB', 'PiB', 'EiB']:
        if num < 1024.0:
            return "%3.1f%s" % (num, unit)
        num /= 1024.0
    # Anything of 1024 EiB or more used to fall through and return None;
    # after five divisions the value is in ZiB, so report it in that unit.
    return "%3.1f%s" % (num, 'ZiB')
# Run the growth-plan report only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Let's assume that you're planning to build a cluster with 6 TiB data nodes.
If you expect to store 30 GiB per day on average and the amount of incoming data is expected to grow by 5% per month, then you would need at least 10 nodes in the first year (with all other options left at their defaults).