Last active
August 11, 2023 18:47
-
-
Save yekver/34c9d41c1c4ea478151574ea539e9953 to your computer and use it in GitHub Desktop.
Unlike the `cluster` module, `pm2` gives no direct access to the master process. To return metrics for the whole cluster, the active instance can make IPC calls to the other instances and wait until each of them has sent its locally collected metrics. Finally, all the received metrics have to be aggregated.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const prom = require('prom-client'); | |
const pm2 = require('pm2'); | |
let pm2Bus; | |
const REQ_TOPIC = 'get_prom_register'; | |
/**
 * Promise adapter for the callback-style `pm2` API.
 *
 * @param {string} cmd - Name of the `pm2` method to invoke.
 * @param {...*} args - Arguments passed to that method ahead of the callback.
 * @returns {Promise<*>} Resolves with the callback's second argument, or
 *   rejects with its first argument when that one is truthy.
 */
function pm2exec(cmd, ...args) {
  return new Promise((resolve, reject) => {
    const onDone = (err, resp) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(resp);
    };

    pm2[cmd](...args, onDone);
  });
}
/**
 * Keeps only the process descriptions whose `pm2_env.status` is `'online'`.
 *
 * @param {Array<Object>} instancesData - Process descriptions, each carrying a `pm2_env` object.
 * @returns {Array<Object>} A new array with the online entries, in their original order.
 */
function getOnlineInstances(instancesData) {
  const isOnline = instance => instance.pm2_env.status === 'online';
  return instancesData.filter(isOnline);
}
/**
 * Reads one numeric value out of a pm2 `axm_monitor` object.
 *
 * @param {Object|undefined} axm - The `pm2_env.axm_monitor` object; may be missing.
 * @param {string} key - Metric key, e.g. `'Heap Size'`.
 * @returns {number|null} The parsed number (zero included), or `null` when the
 *   entry is absent or its value does not start with a finite number.
 */
function parseAxmMetric(axm, key) {
  const entry = axm && axm[key];
  if (!entry) {
    return null;
  }
  const parsed = parseFloat(entry.value);
  return Number.isFinite(parsed) ? parsed : null;
}

/**
 * Builds a fresh registry with one gauge per process-level metric and fills it
 * from the given pm2 process descriptions.
 *
 * Every gauge is labelled with the process `name` and its `pm_id` (as
 * `instance`). Metrics that cannot be read for a process are left unset for
 * that process instead of failing the whole collection.
 *
 * @param {Array<Object>} instancesData - Process descriptions exposing `name`,
 *   `pm2_env` and `monit`.
 * @returns {Object} A new `prom.Registry` holding the gauges.
 */
function getMainMetricsRegister(instancesData) {
  // don't use prom.register here because these metrics
  // will be summed in cluster mode!
  const registry = new prom.Registry();

  const gaugeDefinitions = [
    { name: 'up', help: 'Is the process running' },
    { name: 'cpu', help: 'Process cpu usage' },
    { name: 'memory', help: 'Process memory usage' },
    { name: 'heap_size', help: 'Process heap size' },
    { name: 'used_heap_size', help: 'Process heap usage' },
    { name: 'uptime', help: 'Process uptime' },
    { name: 'instances', help: 'Process instances' },
    { name: 'restarts', help: 'Process restarts' },
    { name: 'loop_delay', help: 'Event Loop Latency' },
    { name: 'loop_delay_p95', help: 'Event Loop Latency p95' },
  ];

  const mainMetrics = gaugeDefinitions.reduce((acc, { name, help }) => {
    acc[name] = new prom.Gauge({
      name,
      help,
      labelNames: ['name', 'instance'],
      registers: [registry],
    });
    return acc;
  }, {});

  instancesData.forEach(({ name, pm2_env, monit }) => {
    const labels = {
      name,
      instance: pm2_env.pm_id,
    };
    // axm_monitor (or single keys in it) can be missing; parseAxmMetric
    // turns that into null rather than throwing on `.value`.
    const axm = pm2_env.axm_monitor;

    const values = {
      up: pm2_env.status === 'online' ? 1 : 0,
      cpu: monit.cpu,
      memory: monit.memory,
      heap_size: parseAxmMetric(axm, 'Heap Size'),
      used_heap_size: parseAxmMetric(axm, 'Used Heap Size'),
      uptime: Math.round((Date.now() - pm2_env.pm_uptime) / 1000),
      instances: pm2_env.instances || 1,
      restarts: pm2_env.unstable_restarts + pm2_env.restart_time,
      loop_delay: parseAxmMetric(axm, 'Event Loop Latency'),
      loop_delay_p95: parseAxmMetric(axm, 'Event Loop Latency p95'),
    };

    Object.entries(values).forEach(([metricName, value]) => {
      // null marks "not available"; a real 0 is still reported.
      if (value !== null) {
        mainMetrics[metricName].set(labels, value);
      }
    });
  });

  return registry;
}
/**
 * Asks every other instance to send its metrics to the current one.
 *
 * A `REQ_TOPIC` message carrying the current instance id (read from
 * `process.env.pm_id`) is sent to each listed instance except the current one.
 * When a send fails, the failure is logged and `instancesToWait.count` is
 * decreased by one.
 *
 * @param {Array<Object>} instancesData - Process descriptions exposing `pm_id`.
 * @param {{count: number}} instancesToWait - Shared counter, mutated on send failure.
 * @returns {void}
 */
function requestNeighboursData(instancesData, instancesToWait) {
  const targetInstanceId = Number(process.env.pm_id);
  const request = { topic: REQ_TOPIC, data: { targetInstanceId } };

  for (const { pm_id } of Object.values(instancesData)) {
    if (pm_id === targetInstanceId) {
      continue;
    }

    pm2exec('sendDataToProcessId', pm_id, request).catch(e => {
      instancesToWait.count--;
      console.error(`Failed to request metrics from instance #${pm_id}: ${e.message}`);
    });
  }
}
/**
 * Combines the gauges built by `getMainMetricsRegister` with the metrics held
 * in the default `prom.register` into one aggregated registry.
 *
 * @param {Array<Object>} instancesData - Process descriptions passed on to `getMainMetricsRegister`.
 * @returns {Object} The result of `prom.AggregatorRegistry.aggregate` over both metric sets.
 */
function getCurrentRegistry(instancesData) {
  const mainMetricsJson = getMainMetricsRegister(instancesData).getMetricsAsJSON();
  const defaultMetricsJson = prom.register.getMetricsAsJSON();

  return prom.AggregatorRegistry.aggregate([mainMetricsJson, defaultMetricsJson]);
}
/**
 * Collects the metrics of all online instances and aggregates them.
 *
 * The current instance contributes its own registry directly. Every other
 * online instance is asked over pm2 IPC and answers on the bus event
 * `process:<current pm_id>`. The result is produced as soon as all expected
 * answers have arrived, or once no answer has arrived for one second.
 *
 * @param {Array<Object>} instancesData - Process descriptions as returned by `pm2exec('list')`.
 * @returns {Promise<Object>} Resolves with the aggregated registry; rejects when
 *   the local metrics cannot be read, the bus cannot be launched, or the
 *   aggregation fails.
 */
async function getAggregatedRegistry(instancesData) {
  const onlineInstances = getOnlineInstances(instancesData);
  // Shared with requestNeighboursData, which lowers `count` when a request
  // cannot be delivered.
  const instancesToWait = { count: onlineInstances.length };
  const instanceId = Number(process.env.pm_id);
  const eventName = `process:${instanceId}`;

  const registersList = [];
  registersList[instanceId] = getCurrentRegistry(instancesData).getMetricsAsJSON();

  // The bus has to exist before the listener can be attached, so wait for it
  // here, before any request goes out.
  if (!pm2Bus) {
    pm2Bus = await pm2exec('launchBus');
  }

  return new Promise((resolve, reject) => {
    // The current instance already counts as one response.
    let responsesCount = 1;
    let timeoutId;

    function sendResult() {
      clearTimeout(timeoutId);
      pm2Bus.off(eventName);
      try {
        // registersList is indexed by pm_id and may contain gaps or empty
        // answers; drop those before aggregating.
        resolve(prom.AggregatorRegistry.aggregate(registersList.filter(Boolean)));
      } catch (e) {
        reject(e);
      }
    }

    function kickNoResponseTimeout() {
      timeoutId = setTimeout(() => {
        const missing = Math.max(instancesToWait.count - responsesCount, 0);
        console.warn(`Metrics were sent by timeout. No response from ${missing} instances.`);
        sendResult();
      }, 1000);
    }

    pm2Bus.on(eventName, packet => {
      registersList[packet.data.instanceId] = packet.data.register;
      responsesCount++;
      clearTimeout(timeoutId);

      // `>=` because failed requests may have lowered the expected count
      // below the number of answers already received.
      if (responsesCount >= instancesToWait.count) {
        sendResult();
      } else {
        kickNoResponseTimeout();
      }
    });

    kickNoResponseTimeout();

    // Requests are sent only now, after the listener is attached, so that no
    // answer can arrive unobserved.
    requestNeighboursData(onlineInstances, instancesToWait);
  });
}
// Listener: answers a metrics request from another instance by sending this
// instance's default-registry metrics back, addressed to the requester.
process.on('message', packet => {
  if (packet.topic !== REQ_TOPIC) {
    return;
  }

  const { targetInstanceId } = packet.data;
  const reply = {
    type: `process:${targetInstanceId}`,
    data: {
      instanceId: Number(process.env.pm_id),
      register: prom.register.getMetricsAsJSON(),
    },
  };

  process.send(reply);
});
// Connect to pm2 as soon as the module is loaded. A failed connection is
// logged here instead of surfacing as an unhandled promise rejection.
pm2exec('connect').catch(err => {
  console.error(`Failed to connect to pm2: ${err.message}`);
});
module.exports = async (req, res) => { | |
let responceData; | |
try { | |
const instancesData = await pm2exec('list'); | |
const register = | |
getOnlineInstances(instancesData).length > 1 | |
? await getAggregatedRegistry(instancesData) | |
: getCurrentRegistry(instancesData); | |
responceData = register.metrics(); | |
} catch (err) { | |
console.error(`Failed to get metrics: ${err.message}`); | |
} finally { | |
res.set('Content-Type', prom.register.contentType); | |
res.end(responceData); | |
} | |
}; |
Big thanks for this gist. The API has changed since it was written, but the main logic still works.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@FidelisChimombe if you are trying to implement prometheus to monitor APIs (like response time and so on) we ended up using https://github.com/nginxinc/nginx-prometheus-exporter since we use Nginx as a reverse proxy for our Node.js pm2 server. So if you use Nginx too you can use that exporter, it is a stable package.