Skip to content

Instantly share code, notes, and snippets.

@pandeybk
Last active April 23, 2024 19:52
Show Gist options
  • Save pandeybk/eafcdbcb92cec30581268094f64e591a to your computer and use it in GitHub Desktop.
Save pandeybk/eafcdbcb92cec30581268094f64e591a to your computer and use it in GitHub Desktop.
dcgm-dashboard.json
{
"annotations": {
"list": [
{
"$$hashKey": "object:192",
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.19+) cluster",
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": 12239,
"graphTooltip": 0,
"id": 2,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "kwiMtNfSk"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto",
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "device"
},
"properties": [
{
"id": "custom.width",
"value": 115
}
]
},
{
"matcher": {
"id": "byName",
"options": "gpu"
},
"properties": [
{
"id": "custom.width",
"value": 78
}
]
},
{
"matcher": {
"id": "byName",
"options": "exported_container"
},
"properties": [
{
"id": "custom.width",
"value": 128
}
]
},
{
"matcher": {
"id": "byName",
"options": "exported_pod"
},
"properties": [
{
"id": "custom.width",
"value": 122
}
]
},
{
"matcher": {
"id": "byName",
"options": "Namespace"
},
"properties": [
{
"id": "custom.width",
"value": 158
}
]
},
{
"matcher": {
"id": "byName",
"options": "Container"
},
"properties": [
{
"id": "custom.width",
"value": 149
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 18,
"x": 0,
"y": 0
},
"id": 26,
"options": {
"footer": {
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true,
"sortBy": []
},
"pluginVersion": "9.1.6",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "kwiMtNfSk"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\", gpu=~\"$gpu\"}",
"format": "table",
"legendFormat": "__auto",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "kwiMtNfSk"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\".*\"}",
"format": "table",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "B"
}
],
"title": "GPU Utilization/ Node/ GPU",
"transformations": [
{
"id": "seriesToColumns",
"options": {
"byField": "instance"
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Hostname 2": true,
"Time 1": true,
"Time 2": true,
"UUID 1": true,
"UUID 2": true,
"Value #B": true,
"__name__ 1": true,
"__name__ 2": true,
"container 1": true,
"container 2": true,
"device 2": true,
"endpoint 1": true,
"endpoint 2": true,
"exported_container 2": true,
"exported_namespace 2": true,
"exported_pod 2": true,
"gpu 2": true,
"instance": true,
"job 1": true,
"job 2": true,
"modelName 1": true,
"modelName 2": true,
"namespace 1": true,
"namespace 2": true,
"pod 1": true,
"pod 2": true,
"service 1": true,
"service 2": true
},
"indexByName": {
"Hostname 1": 1,
"Hostname 2": 19,
"Time 1": 3,
"Time 2": 18,
"UUID 1": 4,
"UUID 2": 20,
"Value #A": 17,
"Value #B": 34,
"__name__ 1": 5,
"__name__ 2": 21,
"container 1": 6,
"container 2": 22,
"device 1": 7,
"device 2": 23,
"endpoint 1": 9,
"endpoint 2": 24,
"exported_container 1": 12,
"exported_container 2": 25,
"exported_namespace 1": 10,
"exported_namespace 2": 26,
"exported_pod 1": 11,
"exported_pod 2": 27,
"gpu 1": 8,
"gpu 2": 28,
"instance": 0,
"job 1": 13,
"job 2": 29,
"modelName 1": 2,
"modelName 2": 30,
"namespace 1": 14,
"namespace 2": 31,
"pod 1": 15,
"pod 2": 32,
"service 1": 16,
"service 2": 33
},
"renameByName": {
"Value #A": "GPU Utilization",
"container 1": "",
"exported_container 1": "Container",
"exported_namespace 1": "Namespace",
"exported_pod 1": "Pod",
"gpu 1": ""
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "#EAB839",
"value": 83
},
{
"color": "red",
"value": 87
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 0
},
"id": 14,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.1.6",
"targets": [
{
"datasource": {
"uid": "$datasource"
},
"editorMode": "code",
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"})",
"interval": "",
"legendFormat": "",
"range": true,
"refId": "A"
}
],
"title": "GPU Avg. Temp",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "kwiMtNfSk"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto",
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "{Hostname=\"worker1.cloud9c.xtoph156.dfw.ocp.run\", UUID=\"GPU-0d986ca6-0fdf-cd1a-7022-e89ad0d55f61\", __name__=\"DCGM_FI_DEV_GPU_UTIL\", container=\"nvidia-dcgm-exporter\", device=\"nvidia0\", endpoint=\"gpu-metrics\", exported_container=\"llama2\", exported_namespace=\"finetune\", exported_pod=\"llama2-0\", gpu=\"0\", instance=\"192.168.156.136:9400\", job=\"nvidia-dcgm-exporter\", modelName=\"Tesla V100-PCIE-16GB\", namespace=\"nvidia-gpu-operator\", pod=\"nvidia-dcgm-exporter-2s2p8\", service=\"nvidia-dcgm-exporter\"}"
},
"properties": [
{
"id": "custom.width",
"value": 299
}
]
},
{
"matcher": {
"id": "byName",
"options": "__name__ 2"
},
"properties": [
{
"id": "custom.width",
"value": 231
}
]
},
{
"matcher": {
"id": "byName",
"options": "container 1"
},
"properties": [
{
"id": "custom.width",
"value": 226
}
]
},
{
"matcher": {
"id": "byName",
"options": "job 1"
},
"properties": [
{
"id": "custom.width",
"value": 208
}
]
},
{
"matcher": {
"id": "byName",
"options": "modelName"
},
"properties": [
{
"id": "custom.width",
"value": 164
}
]
},
{
"matcher": {
"id": "byName",
"options": "exported_namespace"
},
"properties": [
{
"id": "custom.width",
"value": 151
}
]
},
{
"matcher": {
"id": "byName",
"options": "instance"
},
"properties": [
{
"id": "custom.width",
"value": 112
}
]
},
{
"matcher": {
"id": "byName",
"options": "Hostname"
},
"properties": [
{
"id": "custom.width",
"value": 200
}
]
},
{
"matcher": {
"id": "byName",
"options": "gpu"
},
"properties": [
{
"id": "custom.width",
"value": 72
}
]
},
{
"matcher": {
"id": "byName",
"options": "GPU Util"
},
"properties": [
{
"id": "custom.width",
"value": 57
}
]
},
{
"matcher": {
"id": "byName",
"options": "device"
},
"properties": [
{
"id": "custom.width",
"value": 101
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 18,
"x": 0,
"y": 8
},
"id": 22,
"options": {
"footer": {
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true,
"sortBy": []
},
"pluginVersion": "9.1.6",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "kwiMtNfSk"
},
"editorMode": "code",
"exemplar": false,
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\"}",
"format": "table",
"instant": false,
"interval": "",
"legendFormat": "__auto",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "kwiMtNfSk"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_COUNT{instance=~\"$instance\"}",
"format": "table",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "kwiMtNfSk"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\"}",
"format": "table",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "C"
}
],
"title": "Processes/ Node",
"transformations": [
{
"id": "seriesToColumns",
"options": {
"byField": "instance"
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Hostname 2": true,
"Hostname 3": true,
"Time 1": true,
"Time 2": true,
"Time 3": true,
"UUID 1": true,
"UUID 2": true,
"UUID 3": true,
"Value #A": false,
"__name__ 1": true,
"__name__ 2": true,
"__name__ 3": true,
"container 1": true,
"container 2": true,
"container 3": true,
"device 2": true,
"device 3": true,
"endpoint 1": true,
"endpoint 2": true,
"endpoint 3": true,
"exported_container 2": true,
"exported_container 3": true,
"exported_namespace 2": true,
"exported_namespace 3": true,
"exported_pod 1": false,
"exported_pod 2": true,
"exported_pod 3": true,
"gpu 1": false,
"gpu 2": true,
"gpu 3": true,
"job 1": true,
"job 2": true,
"job 3": true,
"modelName 2": true,
"modelName 3": true,
"namespace 1": true,
"namespace 2": true,
"namespace 3": true,
"pod 1": true,
"pod 2": true,
"pod 3": true,
"service 1": true,
"service 2": true,
"service 3": true
},
"indexByName": {
"Hostname 1": 2,
"Hostname 2": 19,
"Time 1": 1,
"Time 2": 18,
"UUID 1": 3,
"UUID 2": 20,
"Value #A": 17,
"Value #B": 34,
"__name__ 1": 4,
"__name__ 2": 21,
"container 1": 5,
"container 2": 22,
"device 1": 6,
"device 2": 23,
"endpoint 1": 7,
"endpoint 2": 24,
"exported_container 1": 8,
"exported_container 2": 25,
"exported_namespace 1": 9,
"exported_namespace 2": 26,
"exported_pod 1": 11,
"exported_pod 2": 27,
"gpu 1": 12,
"gpu 2": 28,
"instance": 0,
"job 1": 13,
"job 2": 29,
"modelName 1": 10,
"modelName 2": 30,
"namespace 1": 14,
"namespace 2": 31,
"pod 1": 15,
"pod 2": 32,
"service 1": 16,
"service 2": 33
},
"renameByName": {
"Value #A": "GPU Util",
"Value #B": "GPU Count",
"Value #C": "Temperature"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 8
},
"id": 20,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.1.6",
"targets": [
{
"datasource": {
"uid": "$datasource"
},
"editorMode": "code",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\"})",
"format": "time_series",
"range": true,
"refId": "A"
}
],
"title": "Total Number of GPUs",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "kwiMtNfSk"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto",
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "__name__"
},
"properties": [
{
"id": "custom.width",
"value": 280
}
]
},
{
"matcher": {
"id": "byName",
"options": "modelName"
},
"properties": [
{
"id": "custom.width",
"value": 187
}
]
},
{
"matcher": {
"id": "byName",
"options": "instance"
},
"properties": [
{
"id": "custom.width",
"value": 172
}
]
},
{
"matcher": {
"id": "byName",
"options": "Hostname"
},
"properties": [
{
"id": "custom.width",
"value": 293
}
]
},
{
"matcher": {
"id": "byName",
"options": "device"
},
"properties": [
{
"id": "custom.width",
"value": 104
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 18,
"x": 0,
"y": 16
},
"id": 24,
"options": {
"footer": {
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true,
"sortBy": []
},
"pluginVersion": "9.1.6",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "kwiMtNfSk"
},
"editorMode": "code",
"expr": "count by (instance) (DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\"})",
"format": "table",
"legendFormat": "__auto",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "kwiMtNfSk"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_GPU_UTIL",
"format": "table",
"hide": false,
"legendFormat": "__auto",
"range": true,
"refId": "B"
}
],
"title": "Number of GPUs/ Node",
"transformations": [
{
"id": "seriesToColumns",
"options": {
"byField": "instance"
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Time 1": true,
"Time 2": true,
"UUID": true,
"Value #B": true,
"__name__": true,
"container": true,
"device": true,
"endpoint": true,
"exported_container": true,
"exported_namespace": true,
"exported_pod": true,
"gpu": true,
"job": true,
"namespace": true,
"pod": true,
"service": true
},
"indexByName": {
"Hostname": 0,
"Time 1": 3,
"Time 2": 4,
"UUID": 2,
"Value #A": 19,
"Value #B": 18,
"__name__": 5,
"container": 6,
"device": 7,
"endpoint": 9,
"exported_container": 10,
"exported_namespace": 11,
"exported_pod": 12,
"gpu": 13,
"instance": 1,
"job": 14,
"modelName": 8,
"namespace": 15,
"pod": 16,
"service": 17
},
"renameByName": {
"Value #A": "Total GPUs",
"instance": ""
}
}
}
],
"type": "table"
},
{
"datasource": {
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 2400,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "#EAB839",
"value": 1800
},
{
"color": "red",
"value": 2200
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 16
},
"id": 16,
"links": [],
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"sum"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.1.6",
"targets": [
{
"datasource": {
"uid": "$datasource"
},
"expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"})",
"instant": true,
"interval": "",
"legendFormat": "",
"range": false,
"refId": "A"
}
],
"title": "GPU Power Total",
"type": "gauge"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 11,
"x": 0,
"y": 24
},
"hiddenSeries": false,
"id": 12,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "9.1.6",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": {
"uid": "$datasource"
},
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"}",
"instant": false,
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeRegions": [],
"title": "GPU Temperature",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"format": "celsius",
"logBase": 1,
"show": true
},
{
"format": "short",
"logBase": 1,
"show": true
}
],
"yaxis": {
"align": false
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 13,
"x": 11,
"y": 24
},
"hiddenSeries": false,
"id": 10,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "9.1.6",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": {
"uid": "$datasource"
},
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"}",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeRegions": [],
"title": "GPU Power Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"format": "watt",
"logBase": 1,
"show": true
},
{
"format": "short",
"logBase": 1,
"show": true
}
],
"yaxis": {
"align": false
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 11,
"x": 0,
"y": 32
},
"hiddenSeries": false,
"id": 2,
"interval": "",
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "9.1.6",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": {
"uid": "$datasource"
},
"expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"$instance\", gpu=~\"$gpu\"} * 1000000",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "GPU {{gpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeRegions": [],
"title": "GPU SM Clocks",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"format": "hertz",
"label": "",
"logBase": 1,
"show": true
},
{
"format": "short",
"logBase": 1,
"show": true
}
],
"yaxis": {
"align": false
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 13,
"x": 11,
"y": 32
},
"hiddenSeries": false,
"id": 6,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "9.1.6",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": {
"uid": "$datasource"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\", gpu=~\"$gpu\"}",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"thresholds": [],
"timeRegions": [],
"title": "GPU Utilization",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"logBase": 1,
"max": "100",
"min": "0",
"show": true
},
{
"format": "short",
"logBase": 1,
"show": true
}
],
"yaxis": {
"align": false
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 11,
"x": 0,
"y": 40
},
"hiddenSeries": false,
"id": 4,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "9.1.6",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": {
"uid": "$datasource"
},
"editorMode": "code",
"expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"$instance\", gpu=~\"$gpu\"}",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"thresholds": [],
"timeRegions": [],
"title": "Tensor Core Utilization",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"format": "percentunit",
"logBase": 1,
"max": "1",
"min": "0",
"show": true
},
{
"format": "short",
"logBase": 1,
"show": true
}
],
"yaxis": {
"align": false
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 13,
"x": 11,
"y": 40
},
"hiddenSeries": false,
"id": 18,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "9.1.6",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": {
"uid": "$datasource"
},
"editorMode": "code",
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"$instance\", gpu=~\"$gpu\"}",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"thresholds": [],
"timeRegions": [],
"title": "GPU Framebuffer Mem Used",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"format": "decmbytes",
"logBase": 1,
"show": true
},
{
"format": "short",
"logBase": 1,
"show": true
}
],
"yaxis": {
"align": false
}
}
],
"refresh": false,
"schemaVersion": 37,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "Prometheus",
"value": "Prometheus"
},
"hide": 0,
"includeAll": false,
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "instance",
"options": [],
"query": {
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
"refId": "Prometheus-instance-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "gpu",
"options": [],
"query": {
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)",
"refId": "Prometheus-gpu-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "NVIDIA DCGM Exporter Dashboard V2",
"uid": "Oxed_c6Wz",
"version": 14,
"weekStart": ""
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment