Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Buttonwood/054d99a0778c9c8382c5a8107db76a08 to your computer and use it in GitHub Desktop.
Save Buttonwood/054d99a0778c9c8382c5a8107db76a08 to your computer and use it in GitHub Desktop.
Example of how to get info on Hive YARN jobs for a specific Sentry user using the Cloudera Manager API
#!/usr/bin/python
## ********************************************************************************
## get-hive-yarn-jobs-for-sentry-user.py
##
## Example of how to retrieve info on YARN Hive jobs for a given Sentry user
## using the Cloudera Manager API
##
## Usage: ./get-hive-yarn-jobs-for-sentry-user.py <sentry_user_name>
##
## <sentry_user_name> is the name of the user you want to retrieve info for
##
## for example: ./get-hive-yarn-jobs-for-sentry-user.py mark
##
## Edit the settings in the script to connect to your Cluster
##
## The script assumes one YARN Service exists on the Cluster
##
## ********************************************************************************
import sys
import time
import pprint
from datetime import datetime, timedelta
from cm_api.api_client import ApiResource
## Connection settings for Cloudera Manager -- edit these to match your cluster
## Display name of the cluster to inspect
cluster_name = "CLUSTER_NAME"
## Host and port of the Cloudera Manager server; the port is kept as a string
## because it is concatenated into the "Connecting to ..." message
cm_host = "CM_HOST"
cm_port = "7180"
## Credentials used to log in to Cloudera Manager
cm_login = "CM_USER"
cm_password = "CM_PASSWORD"
## Get command line args: exactly one argument, the Sentry user name, is required
if len(sys.argv) != 2:
    print(" Usage: ./get-hive-yarn-jobs-for-sentry-user.py <sentry_user_name>")
    ## Use sys.exit() rather than quit(): quit() is added by the site module
    ## for interactive sessions and is not guaranteed to exist in a script
    ## (for example when Python is started with -S)
    sys.exit(1)
sentry_user_name = sys.argv[1]
## Pretty printer (4-space indent) for dumping a job's attributes
pp = pprint.PrettyPrinter(indent=4)
## strftime-style pattern for formatting dates
fmt = '%Y-%m-%d %H:%M:%S %Z'
## Build the Cloudera Manager API client from the settings above
print("\nConnecting to Cloudera Manager at " + cm_host + ":" + cm_port)
api = ApiResource(
    server_host=cm_host,
    server_port=cm_port,
    username=cm_login,
    password=cm_password,
)
## Get Cluster: take the first cluster whose display name equals cluster_name,
## or None when no cluster matches
clusters = api.get_all_clusters()
cluster = next((c for c in clusters if c.displayName == cluster_name), None)
if cluster is None:
    print("\nError: Cluster '" + cluster_name + "' not found")
    ## sys.exit() instead of quit(): quit() comes from the site module and is
    ## meant for interactive use, so it may be missing when run as a script
    sys.exit(1)
## Get YARN Service: take the first service of type "YARN" on the cluster,
## or None when the cluster has no such service
service_list = cluster.get_all_services()
yarn_service = next((s for s in service_list if s.type == "YARN"), None)
if yarn_service is None:
    print("Error: Could not locate YARN Service")
    ## sys.exit() instead of quit(): quit() comes from the site module and is
    ## meant for interactive use, so it may be missing when run as a script
    sys.exit(1)
## Query window for this example: the one day ending now (UTC)
window_end = datetime.utcnow()
window_start = window_end - timedelta(days=1)
## Filter on the hive user so only the Hive jobs come back;
## the Sentry user is matched against each result afterwards
hive_filter = 'user = hive'
## Fetch the YARN Hive jobs in the window (limit of 1000)
yarn_apps = yarn_service.get_yarn_applications(
    start_time=window_start,
    end_time=window_end,
    filter_str=hive_filter,
    limit=1000,
).applications
## Change 'False' to 'True' in the line below to see all of the YARN attributes
## available for each job; if you see ones you want to print, refer to them
## like the 'hive_sentry_subject_name' example below
show_all_attributes = False

## Iterate over the jobs and report those run on behalf of sentry_user_name
for yarn_app in yarn_apps:
    attributes = yarn_app.attributes

    if show_all_attributes:
        pp.pprint(attributes)

    ## Get the Sentry user for the job; .get() yields None instead of raising
    ## KeyError when the job does not carry this attribute
    sentry_subject = attributes.get('hive_sentry_subject_name')

    ## Print the jobs only for the Sentry user we are looking for.
    ## Skip non-matching jobs with 'continue' -- a 'break' here would stop at
    ## the first job of another user and hide every matching job after it.
    if sentry_subject is None or sentry_subject != sentry_user_name:
        continue

    print("\n-- YARN Job ID: " + yarn_app.applicationId + " --------------")
    print("YARN App Name: " + yarn_app.name)
    print("YARN User: " + yarn_app.user)
    print("Sentry User: " + sentry_subject)

    ## Print a couple more YARN app attributes, each only when present
    hive_query_string = attributes.get('hive_query_string')
    if hive_query_string is not None:
        print("Hive Query: " + hive_query_string)

    cpu_millis = attributes.get('cpu_milliseconds')
    if cpu_millis is not None:
        print("CPU Millis: " + cpu_millis)

## Closing separator, printed once after all jobs have been examined
print("\n-----------------------------------------")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment