@matthewpick
Last active June 17, 2021 20:01
EMR Cluster - Quickly open the Hadoop UI and Spark UI for all application IDs
import logging
import webbrowser

import boto3

logger = logging.getLogger(__name__)


def main():
    master_instance_type = 'm5.xlarge'
    logger.info("Must be on VPN to view webpages!")

    clusters = find_clusters(name='production-streaming-cluster')
    for cluster in clusters:
        app_ids = find_application_ids(cluster['Id'])
        ec2_instances = find_cluster_ec2_instances(cluster['Id'])
        # Assumes the master node is the only instance of this type in the cluster.
        primary_instance = [instance for instance in ec2_instances
                            if instance['InstanceType'] == master_instance_type][0]
        dns_name = primary_instance['PrivateDnsName']

        for app_id in app_ids:
            spark_url = f'http://{dns_name}:20888/proxy/{app_id}/streaming/'
            hadoop_url = f'http://{dns_name}:8088/cluster/app/{app_id}/'
            logger.info("Opening Spark UI %s", spark_url)
            logger.info("Opening Hadoop UI %s", hadoop_url)
            webbrowser.open(spark_url)
            webbrowser.open(hadoop_url)


def find_cluster_ec2_instances(job_flow_id):
    client = boto3.client('ec2')
    # EMR tags every instance it launches with the job flow (cluster) id.
    custom_filter = [{
        'Name': 'tag:aws:elasticmapreduce:job-flow-id',
        'Values': [job_flow_id],
    }]
    response = client.describe_instances(Filters=custom_filter)

    # describe_instances groups instances by reservation; flatten them all.
    for group in response.get('Reservations', []):
        for instance in group.get('Instances', []):
            yield instance


def find_application_ids(cluster_id):
    emr_client = boto3.client('emr')
    cluster_details = emr_client.describe_cluster(ClusterId=cluster_id)

    # Derive the cluster's S3 log location from its LogUri.
    s3_path_parts = cluster_details['Cluster']['LogUri'].replace('s3://', '').replace('s3n://', '').split('/')
    bucket_name = s3_path_parts[0]
    sub_path = "/".join(s3_path_parts[1:-1])
    containers_sub_path = f'{sub_path}/{cluster_id}/containers/'

    # Each YARN application id appears as a "directory" under containers/.
    s3_client = boto3.client('s3')
    result = s3_client.list_objects(Bucket=bucket_name, Prefix=containers_sub_path, Delimiter='/')
    for prefix in result.get('CommonPrefixes', []):
        app_id = prefix['Prefix'].split('/')[-2]
        yield app_id


def find_clusters(name=None):
    client = boto3.client('emr')
    clusters = client.list_clusters(ClusterStates=['RUNNING', 'WAITING']).get('Clusters', [])
    if not clusters:
        logger.info('No clusters in RUNNING or WAITING state')
        return

    for cluster in clusters:
        # If a name is given, only yield clusters whose name matches exactly.
        if name:
            if cluster['Name'] == name:
                yield cluster
        else:
            yield cluster


if __name__ == "__main__":
    # Configure logging so the info messages above are actually printed.
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    main()
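
The helper functions can also be reused on their own, e.g. to list clusters and their application IDs without opening any browser tabs. A minimal sketch, assuming the script is saved locally as emr_open_uis.py (a hypothetical filename) and that boto3 can find AWS credentials with EMR, EC2, and S3 read access:

# Hypothetical usage from another script or a REPL; the module name
# emr_open_uis is an assumption based on saving the gist under that filename.
from emr_open_uis import find_clusters, find_application_ids

for cluster in find_clusters(name='production-streaming-cluster'):
    print(cluster['Id'], cluster['Name'])
    for app_id in find_application_ids(cluster['Id']):
        print('  ', app_id)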