Skip to content

Instantly share code, notes, and snippets.

@jsoref
Forked from fotinakis/Jenkins auto-shudown-slaves job
Last active December 19, 2017 01:24
Show Gist options
  • Save jsoref/3ba9be087e25b6a4b6369bc68b4a8855 to your computer and use it in GitHub Desktop.
Save jsoref/3ba9be087e25b6a4b6369bc68b4a8855 to your computer and use it in GitHub Desktop.
Auto-managed Jenkins slaves on Google Compute Engine
#!/usr/bin/env python
import logging
import os
import sys
import time
import argparse
import httplib2
from os.path import expanduser
from oauth2client.client import GoogleCredentials
from oauth2client.client import flow_from_clientsecrets
from oauth2client.file import Storage
from oauth2client import tools
from oauth2client.tools import run_flow
from apiclient.discovery import build
# --- Authentication ---------------------------------------------------------
# OAuth scope requested by main() when authorizing this script.
GCE_SCOPE = 'https://www.googleapis.com/auth/compute'
# Directory holding the OAuth client secrets and the cached credentials.
OAUTH_STORE = expanduser("~/.jenkins-agent-manager")
CLIENT_SECRETS = os.path.join(OAUTH_STORE, 'client_secrets.json')
OAUTH2_STORAGE = os.path.join(OAUTH_STORE, 'oauth2.dat')

# --- Compute Engine API -----------------------------------------------------
API_VERSION = 'v1'
GCE_URL = 'https://www.googleapis.com/compute/{0}/projects/'.format(API_VERSION)
PROJECT_ID = 'your-project-id-here'

# --- Properties of newly created instances ----------------------------------
DEFAULT_MACHINE_TYPE = 'n1-standard-4'
DEFAULT_NETWORK = 'default'
DEFAULT_SERVICE_EMAIL = 'default'
# Scopes attached to the service account of every instance main() creates.
DEFAULT_SCOPES = [
    'https://www.googleapis.com/auth/devstorage.full_control',
    GCE_SCOPE,
]

# --- Root persistent disk properties ----------------------------------------
# NOTE(review): DEFAULT_SNAPSHOT is not referenced by the code in this file;
# presumably it names the image the per-node disks were created from.
DEFAULT_SNAPSHOT = 'your-snapshot-base-image'
DEFAULT_ZONE = 'us-central1-b'
def main(argv):
    """Start, stop, or list Jenkins build-agent instances on Compute Engine.

    Args:
        argv: Full argument vector including the program name (``sys.argv``).
            Accepts an action (``up``, ``down`` or ``list``),
            ``--instance_name`` (required for ``up`` and ``down``),
            ``--debug``, and the flags contributed by
            ``oauth2client.tools.argparser``.

    Only instances whose name starts with ``node`` are listed or managed.
    Progress messages are written to stderr; ``list`` writes the instance
    names to stdout.

    Raises:
        SystemExit: on a usage error (status 2), when asked to manage an
            instance whose name does not start with ``node`` (status -1),
            or when ``up`` is requested for an instance that is already
            RUNNING.
    """

    def say(message='', stream=None):
        # Portable replacement for the Python 2 only ``print >> stream``
        # statement; behaves the same on Python 2 and 3.
        if stream is None:
            stream = sys.stderr
        stream.write('%s\n' % (message,))

    logging.basicConfig(level=logging.WARN)
    # Report on stderr because Jenkins agent output is funky.
    say('Starting script...')

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[tools.argparser])
    parser.add_argument('action', choices=['up', 'down', 'list'])
    parser.add_argument('--instance_name', required=False)
    parser.add_argument('--debug', required=False, action='store_true')

    # Parse the command-line flags.
    flags = parser.parse_args(argv[1:])
    instance_name = flags.instance_name
    if flags.action in ('up', 'down'):
        if not instance_name:
            # parser.error() prints the usage and exits with status 2, so a
            # missing name is reported as a failure instead of exiting 0.
            parser.error(
                '--instance_name is required for the "%s" action' % flags.action)
        if not instance_name.startswith('node'):
            say("This script is restricted to build nodes, not %s" % instance_name)
            sys.exit(-1)

    if flags.debug:
        # basicConfig() is a no-op once the root logger has a handler, so the
        # level has to be raised on the root logger directly.
        logging.getLogger().setLevel(logging.DEBUG)
        httplib2.debuglevel = 2

    # Perform OAuth 2.0 authorization using credentials cached on disk.
    # NOTE: GoogleCredentials.get_application_default() was the disabled
    # alternative to this stored-credentials flow.
    flow = flow_from_clientsecrets(CLIENT_SECRETS, scope=GCE_SCOPE)
    storage = Storage(OAUTH2_STORAGE)
    credentials = storage.get()
    say('Checking for stored OAuth2 credentials...')
    say('If agent startup hangs here, you probably need to manually login and run the agent.up script once to populate oauth2.dat.')
    if credentials is None or credentials.invalid:
        credentials = run_flow(flow, storage, flags)
    auth_http = credentials.authorize(httplib2.Http())
    say('Got credentials!')

    # Build the service
    gce_service = build('compute', API_VERSION, cache_discovery=False)

    def list_instances():
        """Return the instances in DEFAULT_ZONE whose name starts with 'node'."""
        request = gce_service.instances().list(
            project=PROJECT_ID, filter=None, zone=DEFAULT_ZONE)
        response = request.execute(http=auth_http)
        if not response or 'items' not in response:
            return []
        return [item for item in response['items']
                if item['name'].startswith('node')]

    def print_instances(output=False):
        """Print the managed instance names, to stdout if *output* else stderr."""
        handle = sys.stdout if output else sys.stderr
        say(stream=handle)
        for item in list_instances():
            say(item['name'], stream=handle)
        say(stream=handle)

    def up():
        """Start the named instance, creating it from its disk if it is absent."""
        existing = next(
            (item for item in list_instances() if item['name'] == instance_name),
            None)
        status = existing['status'] if existing else None

        if status == 'TERMINATED':
            # A stopped instance is restarted rather than deleted and rebuilt.
            say('Agent "%s" already exists but is TERMINATED. Starting instance...' % instance_name)
            request = gce_service.instances().start(
                project=PROJECT_ID, instance=instance_name,
                zone=_get_zone(existing['zone']))
            response = request.execute(http=auth_http)
            _blocking_call(gce_service, auth_http, response)
            return
        if status == 'RUNNING':
            # Skip startup if the agent is already up.
            sys.exit('Agent "%s" already exists.' % instance_name)

        print_instances()

        # Construct URLs
        project_url = '%s%s' % (GCE_URL, PROJECT_ID)
        zone_url = '%s/zones/%s' % (project_url, DEFAULT_ZONE)
        # Disk name matches instance name.
        disk_source_url = '%s/zones/%s/disks/%s' % (
            project_url, DEFAULT_ZONE, instance_name)
        machine_type_url = '%s/zones/%s/machineTypes/%s' % (
            project_url, DEFAULT_ZONE, DEFAULT_MACHINE_TYPE)
        network_url = '%s/global/networks/%s' % (project_url, DEFAULT_NETWORK)

        # Construct the request body
        body = {
            'name': instance_name,
            'machineType': machine_type_url,
            'disks': [{
                'type': 'PERSISTENT',
                'boot': 'true',
                'mode': 'READ_WRITE',
                'deviceName': instance_name,
                'zone': zone_url,
                'source': disk_source_url,
                'autoDelete': 'false',
            }],
            'networkInterfaces': [{
                'accessConfigs': [{
                    'type': 'ONE_TO_ONE_NAT',
                    'name': 'External NAT'
                }],
                'network': network_url,
            }],
            'serviceAccounts': [{
                'email': DEFAULT_SERVICE_EMAIL,
                'scopes': DEFAULT_SCOPES,
            }]
        }

        # Create the instance.
        request = gce_service.instances().insert(
            project=PROJECT_ID, body=body, zone=DEFAULT_ZONE)
        response = request.execute(http=auth_http)
        response = _blocking_call(gce_service, auth_http, response)
        say(response)

    def down():
        """Stop the named instance; it is stopped, not deleted, so up() can restart it."""
        print_instances()
        request = gce_service.instances().stop(
            project=PROJECT_ID, zone=DEFAULT_ZONE, instance=instance_name)
        response = request.execute(http=auth_http)
        response = _blocking_call(gce_service, auth_http, response)
        say(response)

    if flags.action == 'up':
        up()
    elif flags.action == 'down':
        down()
    elif flags.action == 'list':
        print_instances(True)
    else:
        raise Exception('Invalid action: %s' % flags.action)
def _get_zone(url):
return url.split('/')[-1]
def _blocking_call(gce_service, auth_http, response):
"""Blocks until the operation status is done for the given operation."""
status = response['status']
while status != 'DONE' and response:
operation_id = response['name']
# Identify if this is a per-zone resource
if 'zone' in response:
zone_name = _get_zone(response['zone'])
request = gce_service.zoneOperations().get(
project=PROJECT_ID,
operation=operation_id,
zone=zone_name)
else:
request = gce_service.globalOperations().get(
project=PROJECT_ID, operation=operation_id)
response = request.execute(http=auth_http)
if response:
status = response['status']
time.sleep(1)
return response
if __name__ == '__main__':
    # Run the command-line interface only when executed as a script.
    main(sys.argv)
#!/bin/bash
# Stop every managed build agent that is reachable over SSH but no longer
# runs a Jenkins agent (agent.jar / slave.jar) process.
. ~/jenkins-google-auth/bin/activate

# agent.py pads its listing with blank lines; grep '.' drops them.
~/bin/agent.py list 2>/dev/null | grep '.' | while read -r agent; do
    # 1. Check to see if there is such a host in DNS.
    # 2. Check if we can SSH into the host. If we can, then check the process
    #    and maybe shut down. This makes sure that we don't consider an SSH
    #    failure to be reason to shut down the node.
    #
    # Every command in the loop body that could read stdin gets /dev/null, so
    # it cannot swallow the rest of the agent list feeding this loop.
    if ping -c1 "$agent" 2>&1 | grep unknown > /dev/null; then
        echo "node $agent is offline"
    elif ssh "$agent" echo < /dev/null > /dev/null; then
        if PID=$(ssh "$agent" "pgrep -f '^java.*agent.jar'" < /dev/null ||
                 ssh "$agent" "pgrep -f '^java.*slave.jar'" < /dev/null); then
            # pgrep prints one PID per line; show them on a single line.
            pids=${PID//$'\n'/ }
            echo "agent|slave jar is still running on $agent ($pids). Leaving things alone..."
        else
            echo "agent.jar is NOT running on $agent. Shutting down instance..."
            ~/bin/agent.py down --instance_name="$agent" < /dev/null
        fi
    else
        echo "Maybe instance $agent is offline?"
    fi
done
# Each Jenkins node is configured to "Launch slave via execution of command on the Master", with its "Launch command" set to one of the following:
/var/lib/jenkins/bin/start-agent-and-connect slave-1
/var/lib/jenkins/bin/start-agent-and-connect slave-2
etc.
#!/bin/bash
# Usage: start-agent-and-connect <instance-name>
#
# Bring up the named build-agent instance, copy the master's current
# agent.jar to it, and run the agent over SSH.
set -x

if [ -z "$1" ]; then
    # Diagnostics go to stderr so nothing extra is written to stdout.
    echo "Usage: $0 <instance-name>" >&2
    exit 2
fi
agent=$1

. ~/jenkins-google-auth/bin/activate
echo "Starting agent and connecting..."
# The exit status is deliberately not checked: agent.py exits non-zero when
# the instance is already running, and we still want to connect to it.
~/bin/agent.py up --instance_name="$agent"
echo 'Connecting...'

# SSH into the agent, grab the latest agent.jar from the master, and run it.
AGENT_JAR=$(mktemp "${TMPDIR:-/tmp}/agent.jar.XXXXXX") || exit 1
# Remove the download even when curl or scp fails part-way.
trap 'rm -f "$AGENT_JAR"' EXIT

# -f makes curl fail on an HTTP error instead of saving the error page as the jar.
curl -fsS -o "$AGENT_JAR" http://localhost:8080/jnlpJars/agent.jar &&
    scp "$AGENT_JAR" "$agent":agent.jar &&
    rm -f "$AGENT_JAR" &&
    ssh "$agent" "exec java -Xms1024m -Djava.awt.headless=true -jar ~/agent.jar"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment