Skip to content

Instantly share code, notes, and snippets.

@jlinoff
Last active August 7, 2021 19:33
Show Gist options
  • Save jlinoff/847089bab1ed749533f1d9e2e17e09a1 to your computer and use it in GitHub Desktop.
Save jlinoff/847089bab1ed749533f1d9e2e17e09a1 to your computer and use it in GitHub Desktop.
jenkins tools
#!/usr/bin/env python3
# pylint: disable=line-too-long
r'''
Extract the Jenkins build data into a JSON file for insertion into a
database where it can be used for graphing in grafana.
The following environment variables control the behavior of this program.
JFN Name of the text file to write all of the job names to.
This is mainly used for debugging.
The default is not to write the job names.
Example: JFN=/tmp/alljobs.txt
EFN Name of the error file.
This JSON file contains the errors that were caught.
Default: 'errs.json'
DAYS The number of days (last N days) to process. The default is 30.
BEG Begin date in ISO 8601 format. Default is DAYS days ago.
END End date in ISO 8601 format. Default is now.
MAX Maximum number of builds per job. Useful for cases
where there are jobs with thousands of builds. The default is 0 (all).
MAX=1 retrieves the latest build for each branch.
choose MAX=1 DAYS=1825 to get the latest build searching back 5 years.
choose MAX=1 DAYS=3650 to get the latest build searching back 10 years.
NOWARN NOWARN=1 turns off warnings.
TIMEOUT Sets the timeout in seconds. The default is 300.
This will vary considerably for each site.
I typically use 7200 for my site.
PREFIX The top level prefix regex for job names. The default is '^eng'.
This is also site specific.
VERBOSE VERBOSE=1 turns on verbose mode, status messages are written
to stderr. VERBOSE=2 turns on very verbose mode.
Args:
credentials_file The JSON file containing the credentials. A dictionary with three fields:
username, password, url.
output_file The output JSON file containing the data.
Usage:
$ time VERBOSE=1 TIMEOUT=7200 ./extract.py creds.json eng-all.json # very long time
$ time VERBOSE=1 TIMEOUT=7200 MAX=1000 ./extract.py creds.json eng-all-1000.json
$ time VERBOSE=2 DAYS=60 TIMEOUT=7200 MAX=1000 ./extract.py creds.json extract-60.json
$ time VERBOSE=2 DAYS=7 TIMEOUT=7200 ./extract.py creds.json eng-7.json
$ time VERBOSE=2 PREFIX='^eng/platform' DAYS=7 TIMEOUT=7200 ./extract.py creds.json eng.json
$ time VERBOSE=1 PREFIX='^eng/platform' DAYS=7 TIMEOUT=7200 ./extract.py jcreds.json eng.json
$ time VERBOSE=1 PREFIX='^eng/platform-pipe' DAYS=30 TIMEOUT=7200 ./extract.py jcreds.json eng.json
$ time VERBOSE=1 PREFIX='^eng' DAYS=30 TIMEOUT=7200 ./extract.py jcreds.json eng-30.json
$ time VERBOSE=1 PREFIX='^eng' DAYS=60 TIMEOUT=7200 ./extract.py jcreds.json eng-60.json
$ time VERBOSE=1 PREFIX='^eng' DAYS=90 TIMEOUT=7200 ./extract.py jcreds.json eng-90.json
$ # Get the latest builds for all branches and all jobs.
$ time VERBOSE=1 PREFIX='^eng' NOWARN=1 DAYS=3650 MAX=1 TIMEOUT=7200 ./extract.py jcreds.json eng-latest.json
The output file is a JSON file with an array of build entries,
each with the following fields:
1. 'status' Status string: SUCCESS, FAILURE, ABORTED, etc.
2. 'timestamp' Timestamp in ISO 8601 format.
3. 'duration' Duration in seconds.
4. 'running' Is the build running flag.
5. 'url' The build URL.
6. 'project' The build project.
7. 'branch' The build branch.
8. 'number' The build number.
It can be further processed to insert the data into a database table.
'''
# pylint: enable=line-too-long
import datetime
import inspect
import json
import os
import re
import sys
from getpass import getpass
from typing import Tuple, TextIO, List
from jenkinsapi.jenkins import Jenkins # pylint: disable=import-error
import requests
import dateutil.parser # pylint: disable=import-error
# Module variables.
VERBOSE = int(os.getenv('VERBOSE', '0'))
JFN = os.getenv('JFN', '')
EFN = os.getenv('EFN', 'errs.json')
MAX = int(os.getenv('MAX', '0'))
NOW = datetime.datetime.utcnow()
DAYS = int(os.getenv('DAYS', '30'))
DAYS_AGO = datetime.timedelta(days=DAYS)
BEG = os.getenv('BEG', (NOW - DAYS_AGO).isoformat(timespec='seconds') + '+00:00')
END = os.getenv('END', NOW.isoformat(timespec='seconds') + '+00:00')
BEG_DATE = dateutil.parser.parse(BEG) # pylint: disable=no-member
END_DATE = dateutil.parser.parse(END) # pylint: disable=no-member
PREFIX = os.getenv('PREFIX', '^eng')
TIMEOUT = float(os.getenv('TIMEOUT', '300'))
NOWARN = bool(int(os.getenv('NOWARN', '0')))
def info(msg: str, ofp: TextIO = sys.stderr, end='\n', level=1):
'''Print a verbose info message.
Args:
msg: The message
ofp: The output file.
end: The message terminator.
level: The level of the caller in the stack.
'''
lnum = inspect.stack()[level].lineno
print(f'INFO:{lnum}: {msg}', file=ofp, end=end)
def warn(msg: str, ofp: TextIO = sys.stderr, end='\n', level=1):
'''Print a verbose warning message.
Args:
msg: The message
ofp: The output file.
end: The message terminator.
level: The level of the caller in the stack.
'''
if not NOWARN:
print('\x1b[35m', end='', file=ofp)
lnum = inspect.stack()[level].lineno
print(f'WARNING:{lnum}: {msg}', file=ofp, end=end)
print('\x1b[0m', end='', file=ofp)
def err(msg: str, ofp: TextIO = sys.stderr, end='\n', level=1):
'''Error message.
Args:
msg: The message
ofp: The output file.
end: The message terminator.
level: The level of the caller in the stack.
'''
lnum = inspect.stack()[level].lineno
print('\x1b[31m', end='', file=ofp)
print(f'ERROR:{lnum}: {msg}', file=ofp, end=end)
print('\x1b[0m', end='', file=ofp)
def get_jenkins_creds(fname: str) -> Tuple[str,str,str]:
'''
Load the jenkins credentials into a dict object
for use with get_jenkins_server function.
There are three keys: username, password and url.
Behind a firewall the credentials password is normally a token.
That is obtained by creating a global credentials user in jenkins.
If any of the fields are missing, prompt for them.
Returns:
creds: dictionary of the jenkins credentials
'''
creds = {
'username': '',
'password': '',
'url': '',
}
with open(fname) as ifp:
string = ifp.read().strip()
try:
creds = json.loads(string)
except json.decoder.JSONDecodeError as exc:
err(f'failed to decode credentials file: {fname}: {exc}')
# Prompt for missing fields.
for key in ['username', 'password', 'url']:
if key in creds and not creds[key]:
if key == 'password':
creds[key] = getpass(f'{key.capitalize()}? ')
else:
creds[key] = input(f'{key.capitalize()}? ')
# Verify no bad input.
for key in ['username', 'password', 'url']:
if key in creds and not creds[key]:
err(f'missing value for "{key}" in {fname}')
sys.exit(1)
return creds
def get_jenkins_server(creds: dict) -> Jenkins:
'''
Get the jenkins server object.
It is used for all subsequent interface calls.
The object construction loads ALL of the jobs when not in lazy mode
which can take a few minutes.
Args:
creds: The login credentials in a dict with username, password and url.
Returns:
server: The jenkins server object.
'''
url = creds['url']
if VERBOSE:
info(f'loading jenkins server data from {url}')
start = datetime.datetime.now()
server = Jenkins(username=creds['username'],
password=creds['password'],
baseurl=url,
timeout=TIMEOUT,
lazy=True)
if VERBOSE:
elapsed = datetime.datetime.now() - start
info(f'loaded jenkins server in {elapsed.total_seconds()} seconds')
return server
def get_server_jobs_names(server: Jenkins) -> List[str]:
'''Get the jenkins job names.
Args:
server: The jenkins server object.
Returns:
list: The list of job names.
'''
if VERBOSE:
info('loading server job names')
start = datetime.datetime.now()
job_names = server.keys()
if JFN:
if VERBOSE:
info(f'writing {len(job_names)} job names to "{JFN}"')
with open(JFN, 'w') as ofp:
for job_name in job_names:
ofp.write(job_name + '\n')
if VERBOSE:
elapsed = datetime.datetime.now() - start
info(f'loaded {len(job_names)} jobs names in {elapsed.total_seconds()} seconds')
return job_names
def write_failed_errors(failed: list):
'''Save the failures for later debugging.
Args:
failed: The list of failures.
'''
if failed and EFN:
err(f'{len(failed)} processing failures written to "{EFN}"\n')
with open(EFN, 'w') as efp:
efp.write(json.dumps(failed, indent=4) + '\n')
def process_jobs(server: Jenkins, ofp: TextIO): # pylint: disable=too-many-locals,too-many-branches, too-many-statements
'''Process jobs.
Args:
server: The jenkins server object.
ofp: The output file pointer.
'''
start = datetime.datetime.now()
prefix = re.compile(PREFIX)
job_names = get_server_jobs_names(server)
# don't user server.get_jobs() in the for-loop because it would
# retrieve job info objects for all jobs even the filtered ones
# which would affect performance in cases where a subset of jobs
# is desired.
failed = []
recs = {
'meta': {
'date': datetime.datetime.now().isoformat(timespec='seconds'),
'days': DAYS,
'beg': BEG,
'end': END,
'timeout': TIMEOUT,
'prefix': PREFIX,
'max': MAX,
'efn': EFN,
'jfn': JFN,
'time': 0,
'errors': [],
},
'builds': []
}
njobs = len(job_names)
for i, job_name in enumerate(job_names, start=1):
if not prefix.search(job_name):
if VERBOSE > 1:
warn(f"skipping job name that doesn't match '{PREFIX}': '{job_name}' {i} of {njobs}")
continue
if VERBOSE:
jrem = njobs - i
info(f'\x1b[1mJOB: {job_name} {i} of {njobs} ({jrem})\x1b[0m')
_top, project, branch = job_name.split('/')
# get the job information.
try:
job_instance = server.get_job(job_name)
except requests.exceptions.HTTPError as exc:
err(f'request timed out for server.get_job(job_name): {exc}')
failed.append({'type': 'job', 'name': job_name, 'exc': str(exc)})
continue
# get the build_ids
try:
build_ids = list(job_instance.get_build_ids())
if MAX:
ntrunc = len(build_ids) - MAX if len(build_ids) > MAX else 0
if ntrunc > 0:
warn(f"MAX={MAX} skipping {ntrunc} jobs out of {len(build_ids)}")
build_ids = build_ids[:MAX] # truncate
except requests.exceptions.HTTPError as exc:
err(f'request timed out for job_instance.get_build_ids(): {exc}')
failed.append({'type': 'job', 'name': job_name, 'exc': str(exc)})
continue
nbuilds = len(build_ids)
for j, build_id in enumerate(build_ids, start=1):
# get the build information
if VERBOSE:
rem = nbuilds - j
info(f'BUILD: {len(recs)+1} {job_name}/{build_id} {j} of {nbuilds} ({rem})')
try:
build = job_instance.get_build(build_id)
except requests.exceptions.HTTPError as exc:
err(f'request timed out for job_instance.get_build(build_id): {exc}')
failed.append({'type': 'job', 'name': job_name, 'build_id': build_id, 'exc': exc})
continue
if VERBOSE:
info(f'STATUS: {build.get_status()} {build.get_build_url()}')
dts = build.get_timestamp().isoformat(timespec='seconds')
status = build.get_status() if build.get_status() else 'RUNNING'
rec = {
'project': project,
'branch': branch,
'number': build_id,
'running': build.is_running(),
'status': status,
'duration': build.get_duration().total_seconds(),
'timestamp': dts,
'url': build.get_build_url()
}
if VERBOSE > 1:
info(f'DTS: {dts} {BEG} {END}')
info(json.dumps(rec, indent=4))
if dts < BEG:
if VERBOSE:
warn(f'too old - skipping {rem} builds earlier than BEG for this job: '
f'{dts} < {BEG}')
break # all subsequent builds are earlier
if dts > END:
if VERBOSE:
warn(f'too new - skipping builds later than END for this job: {dts} > {END}')
continue # do not break!! need to get the earlier dates
recs['builds'].append(rec)
ofp.write(json.dumps(recs))
elapsed = datetime.datetime.now() - start
recs['meta']['time'] = elapsed.total_seconds()
recs['meta']['errors'] = failed
if VERBOSE:
info(f'extracted {len(recs["builds"])} build records in {elapsed.total_seconds()} seconds')
write_failed_errors(failed)
def main():
'''main entry point
'''
cfile = ''
ofp = sys.stdout
if len(sys.argv) > 1:
cfile = sys.argv[1]
if len(sys.argv) > 2:
ofp = open(sys.argv[2], 'w') # pylint: disable=consider-using-with
if VERBOSE:
ofn = sys.argv[2] if len(sys.argv) > 2 else 'sys.stdout'
info('Parameters')
info(f' DAYS : {DAYS}')
info(f' BEG : {BEG}')
info(f' END : {END}')
info(f' MAX : {MAX}')
info(f' PREFIX : "{PREFIX}"')
info(f' TIMEOUT : {TIMEOUT}')
info(f' VERBOSE : {VERBOSE}')
info(f' EFN : {EFN}')
info(f' JFN : {JFN}')
info(f' ofn : "{ofn}"')
creds = get_jenkins_creds(cfile)
if VERBOSE:
info(f' username : {creds["username"]}')
info(f' url : {creds["url"]}')
server = get_jenkins_server(creds)
process_jobs(server, ofp)
if ofp != sys.stdout:
ofp.close()
if __name__ == '__main__':
main()
#!/usr/bin/env python3
'''
Convert extract format files into SQL.
Typical usage:
$ time ./convert.py extract.json builds builds.sql
$ # ^ ^ ^
$ # | | +---- output file name
$ # | +----------- SQL table name
$ # +------------------------ input file name
'''
import json
import sys
from typing import TextIO
def tdef(tname: str, meta: str, ofp: TextIO=sys.stdout):
'''Output the table definition
Args:
tname: The table name
meta: The meta table name
ofp: The output file pointer
'''
ofp.write(f'''
DROP TABLE IF EXISTS {tname} CASCADE;
CREATE TABLE IF NOT EXISTS {tname} (
id SERIAL PRIMARY KEY,
status TEXT NOT NULL,
time TIMESTAMPTZ NOT NULL,
duration DECIMAL NOT NULL,
project TEXT NOT NULL,
branch TEXT NOT NULL,
number DECIMAL NOT NULL,
url TEXT NOT NULL
);
COMMENT ON TABLE {tname} IS 'Jenkins build data';
COMMENT ON COLUMN {tname}.status IS 'Result like SUCCESS or FAILURE';
COMMENT ON COLUMN {tname}.time IS 'Build completed timestamp';
COMMENT ON COLUMN {tname}.duration IS 'Build duration in seconds';
COMMENT ON COLUMN {tname}.project IS 'Build project name';
COMMENT ON COLUMN {tname}.branch IS 'Build branch name: "PR-1"';
COMMENT ON COLUMN {tname}.number IS 'Build number';
COMMENT ON COLUMN {tname}.url IS 'Build URL';
DROP TABLE IF EXISTS {meta} CASCADE;
CREATE TABLE IF NOT EXISTS {meta} (
id SERIAL PRIMARY KEY,
key TEXT NOT NULL,
value TEXT NOT NULL
);
COMMENT ON TABLE {meta} IS 'Jenkins build meta data';
COMMENT ON COLUMN {meta}.key IS 'Key name';
COMMENT ON COLUMN {meta}.value IS 'Key value';
''')
def populate_meta(meta: str, recs: dict, ofp: TextIO=sys.stdout):
'''Populate the meta table.
Args:
meta: The meta table name
recs: The extracr JSON data
ofp: The output file pointer
'''
ofp.write(f'''
INSERT INTO {meta} (key, value)
VALUES
''')
for i, key in enumerate(recs['meta']):
if i:
ofp.write(',\n')
value = recs['meta'][key]
ofp.write(f'''\
('{key}', '{value}')\
''')
ofp.write(';\n')
def populate_builds(tname: str, recs: dict, ofp: TextIO=sys.stdout):
'''Populate the builds data
Args:
tname: The table name
recs: The extracr JSON data
ofp: The output file pointer
'''
ofp.write(f'''
INSERT INTO {tname} (status, time, duration, project, branch, number, url)
VALUES
''')
for i, rec in enumerate(recs['builds']):
status = rec['status']
timestamp = rec['timestamp']
duration = int(rec['duration'])
project = rec['project']
branch = rec['branch']
number = rec['number']
url = rec['url']
if not timestamp.endswith('+00:00'):
timestamp += '+00:00'
if i:
ofp.write(',\n')
ofp.write(f'''\
('{status}', '{timestamp}', {duration}, '{project}', '{branch}', '{number}', '{url}')\
''')
ofp.write(';\n')
def main():
'''main
'''
ifn = sys.argv[1] # input file name
tname = sys.argv[2] # table name
ofn = sys.argv[3] # output file name
meta = f'{tname}_meta'
with open(ifn) as ifp:
recs = json.loads(ifp.read())
with open(ofn, 'w') as ofp:
tdef(tname, meta, ofp)
populate_meta(meta, recs, ofp)
populate_builds(tname, recs, ofp)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment