Skip to content

Instantly share code, notes, and snippets.

@corenel
Created August 13, 2020 12:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save corenel/6312c37040eec418c09f41909af33fae to your computer and use it in GitHub Desktop.
Save corenel/6312c37040eec418c09f41909af33fae to your computer and use it in GitHub Desktop.
GPU-Nodes-Metrics-Nvidia
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"limit": 100,
"name": "Annotations & Alerts",
"showIn": 0,
"type": "dashboard"
}
]
},
"description": "使用NVIDIA Data Center GPU Manager (DCGM) dcgm-exporter 通过Prometheus绘制的GPU Nvidia 基础监控信息.",
"editable": true,
"gnetId": 12639,
"graphTooltip": 0,
"id": 1,
"iteration": 1597321805205,
"links": [],
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"format": "hertz",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 2,
"w": 12,
"x": 0,
"y": 0
},
"id": 44,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "GPU SM 时钟",
"targets": [
{
"expr": "avg(DCGM_FI_DEV_SM_CLOCK{instance=~\"$hostname\"}*1000000)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "GPU SM 时钟",
"refId": "A"
}
],
"thresholds": "",
"title": "GPU SM Clocks",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"format": "hertz",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 2,
"w": 12,
"x": 12,
"y": 0
},
"id": 45,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "GPU 内存时钟",
"targets": [
{
"expr": "avg(DCGM_FI_DEV_MEM_CLOCK{instance=~\"$hostname\"}*1000000)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "GPU 内存时钟",
"refId": "A"
}
],
"thresholds": "",
"title": "GPU Memory Clocks",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 2
},
"hiddenSeries": false,
"id": 57,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pluginVersion": "7.1.3",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$hostname\"}",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{instance}}.{{gpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU 使用率",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": null,
"logBase": 1,
"max": "100",
"min": "0",
"show": true
},
{
"format": "watt",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "内存使用",
"fieldConfig": {
"defaults": {
"custom": {},
"links": [],
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 8,
"x": 8,
"y": 2
},
"hiddenSeries": false,
"id": 60,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"percentage": false,
"pluginVersion": "7.1.3",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}",
"interval": "",
"legendFormat": "{{instance}}.{{gpu}}",
"refId": "B"
},
{
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}+DCGM_FI_DEV_FB_FREE{instance=~\"$hostname\"}",
"hide": true,
"interval": "",
"legendFormat": "GPU 总内存 {{instance}}.{{gpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU 内存用量",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "decmbytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"format": "celsius",
"gauge": {
"maxValue": 90,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 5,
"w": 4,
"x": 16,
"y": 2
},
"id": 31,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"$hostname\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": "83,87",
"title": "GPU 平均温度",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"format": "watt",
"gauge": {
"maxValue": 2400,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"y": 2
},
"id": 30,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"$hostname\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": "1800,2200",
"title": "GPU 总功率",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"description": "",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 5,
"w": 4,
"x": 16,
"y": 7
},
"id": 40,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "avg(DCGM_FI_DEV_MEM_COPY_UTIL{instance=~\"$hostname\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": "70,90",
"title": "GPU 总内存利用率",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"description": "",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"y": 7
},
"id": 58,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "avg(DCGM_FI_DEV_GPU_UTIL{instance=~\"$hostname\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": "80,90",
"title": "GPU 总利用率",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 9
},
"hiddenSeries": false,
"id": 24,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pluginVersion": "7.1.3",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"$hostname\"}",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{instance}}.{{gpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU 功率",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "watt",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "watt",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "内存利用率\n",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 10,
"w": 8,
"x": 8,
"y": 11
},
"hiddenSeries": false,
"id": 39,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pluginVersion": "7.1.3",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "DCGM_FI_DEV_MEM_COPY_UTIL{instance=~\"$hostname\"}",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{instance}}.{{gpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU 内存利用率",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": null,
"logBase": 1,
"max": "100",
"min": "0",
"show": true
},
{
"format": "watt",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"custom": {},
"links": [],
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "#6ED0E0",
"value": 25
},
{
"color": "#EAB839",
"value": 50
},
{
"color": "red",
"value": 75
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 8,
"x": 16,
"y": 12
},
"id": 42,
"links": [],
"options": {
"displayMode": "lcd",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"showUnfilled": true
},
"pluginVersion": "7.1.3",
"targets": [
{
"expr": "(DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}/(DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}+DCGM_FI_DEV_FB_FREE{instance=~\"$hostname\"}))*100",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{instance}}.{{gpu}}",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "GPU 内存使用率",
"type": "bargauge"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 15
},
"hiddenSeries": false,
"id": 25,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pluginVersion": "7.1.3",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$hostname\"} ",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{instance}}.{{gpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU 温度",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "celsius",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": "5s",
"schemaVersion": 26,
"style": "dark",
"tags": [
"GPU"
],
"templating": {
"list": [
{
"allValue": null,
"current": {
"selected": true,
"tags": [],
"text": "lab-81 + lab-80 + lab-73 + lab-72 + lab-71 + lab-70 + lab-61 + lab-60",
"value": [
"lab-81",
"lab-80",
"lab-73",
"lab-72",
"lab-71",
"lab-70",
"lab-61",
"lab-60"
]
},
"datasource": "Prometheus",
"definition": "label_values(instance)",
"hide": 0,
"includeAll": false,
"label": "host",
"multi": true,
"name": "hostname",
"options": [],
"query": "label_values(instance)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 6,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "GPU-Nodes-Metrics-Nvidia",
"uid": "hpcsyl6zhqk",
"version": 6
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment