Skip to content

Instantly share code, notes, and snippets.

@utdemir
Created November 23, 2018 03:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save utdemir/e806b03edbebe45e8bc2e59f1a11046a to your computer and use it in GitHub Desktop.
Save utdemir/e806b03edbebe45e8bc2e59f1a11046a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import re
import sys
import argparse
from typing import *
from pprint import pprint
from multiprocessing import Pool
from datetime import datetime, timedelta
import boto3
from botocore.config import Config
import requests
from tqdm import tqdm
# botocore client configuration: retry settings with max_attempts set to 24.
config = Config(retries={"max_attempts": 24})

# Module-wide EMR client, shared by every API helper in this script.
emr = boto3.client("emr", config=config)

# Region name of the default boto3 session; used as the lookup key into the
# per-region pricing data.
region = boto3.session.Session().region_name

# Distinct static types for the two kinds of plain-string identifiers, so a
# type checker can tell an instance type apart from a cluster id.
InstanceType = NewType("InstanceType", str)
ClusterId = NewType("ClusterId", str)
class ClusterMeta(NamedTuple):
    """Summary record for one EMR cluster: id, display name, creation time."""

    # Cluster identifier (the "Id" field of a list_clusters entry).
    id: ClusterId
    # Cluster name; the command-line patterns are matched against this.
    name: str
    # Creation timestamp of the cluster.
    start: datetime
class Cluster(NamedTuple):
    """A cluster's metadata paired with the run time of each of its instances."""

    # Identifying details of the cluster.
    meta: ClusterMeta
    # One (instance type, lifetime) pair per instance of the cluster.
    instances: List[Tuple[InstanceType, timedelta]]
def list_clusters(after: datetime) -> List[ClusterMeta]:
    """Return metadata for every finished cluster created after ``after``.

    Only clusters in state TERMINATED or TERMINATED_WITH_ERRORS are
    requested. All result pages are walked through the ``list_clusters``
    paginator, and a tqdm counter is advanced once per cluster found.

    Args:
        after: Lower bound passed to the API as ``CreatedAfter``.

    Returns:
        One ``ClusterMeta`` per cluster, in the order the API returned them.
    """
    paginator = emr.get_paginator("list_clusters")
    # NOTE: no trailing comma after the call -- the previous version had one,
    # which wrapped the page iterator in a 1-tuple and needed an extra loop
    # level to unwrap it again.
    pages = paginator.paginate(
        CreatedAfter=after,
        ClusterStates=['TERMINATED', 'TERMINATED_WITH_ERRORS'],
    )
    ret = []
    with tqdm(desc="Found clusters: ") as bar:
        for page in pages:
            for cluster in page["Clusters"]:
                ret.append(
                    ClusterMeta(
                        ClusterId(cluster["Id"]),
                        cluster["Name"],
                        cluster["Status"]["Timeline"]["CreationDateTime"],
                    )
                )
                bar.update()
    return ret
def cluster_instances(cid: ClusterId) -> List[Tuple[InstanceType, timedelta]]:
    """Return the instance type and lifetime of every instance in a cluster.

    The ``list_instances`` paginator is used so that every result page is
    read; a single ``list_instances`` call only returns the first page and
    would drop the remaining instances of larger clusters.

    Args:
        cid: Identifier of the cluster to inspect.

    Returns:
        One ``(instance type, EndDateTime - CreationDateTime)`` pair per
        instance.

    Raises:
        KeyError: If an instance's timeline lacks ``CreationDateTime`` or
            ``EndDateTime``.
    """
    paginator = emr.get_paginator("list_instances")
    ret = []
    for page in paginator.paginate(ClusterId=cid):
        for instance in page["Instances"]:
            timeline = instance["Status"]["Timeline"]
            start = timeline["CreationDateTime"]
            end = timeline["EndDateTime"]
            ret.append((InstanceType(instance["InstanceType"]), end - start))
    return ret
#####
# Mapping from instance type to a combined price (EMR price + EC2 price).
Pricing = NewType("Pricing", Dict[InstanceType, float])


def get_pricing() -> Pricing:
    """Download the instance price list and build the table for ``region``.

    For each instance entry, the price is the sum of the "emr" and "ec2"
    values found under ``pricing[region]["emr"]``. Entries with no pricing
    for the region, or no EMR pricing, are skipped.

    Returns:
        Combined price per instance type.

    Raises:
        requests.RequestException: If the download times out or the server
            answers with an HTTP error status.
    """
    url = "https://raw.githubusercontent.com/powdahound/ec2instances.info/master/www/instances.json"
    with tqdm(desc="Downloading pricing...", total=1) as bar:
        # A timeout keeps the script from hanging forever on a stalled
        # connection; raise_for_status() reports HTTP errors directly instead
        # of letting an error page fail later during JSON parsing.
        resp = requests.get(url, timeout=60)
        resp.raise_for_status()
        ret = {}
        for entry in resp.json():
            ty = InstanceType(entry["instance_type"])
            region_pricing = entry["pricing"].get(region)
            if not region_pricing:
                continue
            emr_pricing = region_pricing.get("emr")
            if not emr_pricing:
                continue
            ret[ty] = float(emr_pricing["emr"]) + float(emr_pricing["ec2"])
        bar.update()
    return Pricing(ret)
def cluster_cost(pricing: Pricing, cluster: Cluster) -> float:
    """Return the total cost of a cluster: price times run time, per instance.

    Each instance contributes ``pricing[type] * run time in hours``.

    Args:
        pricing: Price per instance type; the run time is converted to hours
            before multiplying.
        cluster: Cluster whose ``instances`` are ``(type, lifetime)`` pairs.

    Returns:
        Sum over all instances; ``0.0`` for a cluster without instances.

    Raises:
        KeyError: If an instance type is missing from ``pricing``.
    """
    # total_seconds() covers the whole duration. timedelta.seconds is only the
    # sub-day remainder, so it would drop every full day of run time.
    return sum(
        (
            pricing[ty] * lifetime.total_seconds() / 60 / 60
            for ty, lifetime in cluster.instances
        ),
        0.0,
    )
#####
# Script body: for each regex given on the command line, print the total cost
# of the matching clusters created during the last 7 days.

# Build a real list: a bare map() object is always truthy, so the
# "no patterns given" check below would never fire.
patterns = [re.compile(s) for s in sys.argv[1:]]
if not patterns:
    print("Usage: ./emr-cost-calculator.py [PATTERN...]", file=sys.stderr)
    sys.exit(1)

pricing = get_pricing()
# astimezone() attaches the local time zone, so the cutoff sent to the API is
# an unambiguous instant rather than a naive wall-clock value.
metas = list_clusters(datetime.now().astimezone() - timedelta(days=7))

for pattern in patterns:
    # Pattern.match anchors at the start of the cluster name.
    relevant = [i for i in metas if pattern.match(i.name)]
    clusters = [
        Cluster(meta, cluster_instances(meta.id))
        for meta in tqdm(relevant, desc=f"Fetching instances for '{pattern.pattern}'")
    ]
    costs = [(c, cluster_cost(pricing, c)) for c in clusters]
    total_cost = sum(cost for (_, cost) in costs)
    print(total_cost)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment