File: //usr/lib/netdata/conf.d/health.d/dcgm.conf
# DCGM GPU reliability alerts.
# dcgm_gpu_xid_errors: raises a warning when the `xid` dimension of a
# dcgm.gpu.reliability.xid chart was above zero at any point in the last minute
# (the lookup takes the maximum over a 1-minute window; the check runs every 30s).
# Notifications are held back 30s when the alert raises and 5m when it clears
# (see the `delay` line) and are sent to the `sysadmin` role.
# NOTE(review): the info text interpolates ${label:chart_context}; confirm the
# collector really attaches a label with that name to these charts.
template: dcgm_gpu_xid_errors
on: dcgm.gpu.reliability.xid
class: Errors
type: GPU
component: NVIDIA
lookup: max -1m unaligned absolute of xid
units: code
every: 30s
warn: $this > 0
delay: up 30s down 5m multiplier 1.5 max 1h
summary: DCGM reported XID error on GPU ${label:gpu}
info: NVIDIA driver reported a GPU XID error (metric ${label:chart_context}).
to: sysadmin
# dcgm_gpu_row_remap_failure: raises a warning when the `row_remap_failure`
# dimension of a dcgm.gpu.reliability.row_remap_status chart was above zero at
# any point in the last minute (maximum over a 1-minute window, checked every 30s).
# Notifications are held back 30s when the alert raises and 5m when it clears
# (see the `delay` line) and are sent to the `sysadmin` role.
template: dcgm_gpu_row_remap_failure
on: dcgm.gpu.reliability.row_remap_status
class: Errors
type: GPU
component: NVIDIA
lookup: max -1m unaligned absolute of row_remap_failure
units: state
every: 30s
warn: $this > 0
delay: up 30s down 5m multiplier 1.5 max 1h
summary: DCGM row remap failure on GPU ${label:gpu}
info: Row remapping has failed, indicating a persistent memory reliability problem.
to: sysadmin
# dcgm_gpu_uncorrectable_remapped_rows: raises a warning when the sum of the
# `uncorrectable_remapped_rows` dimension of a
# dcgm.gpu.reliability.row_remap_events chart over the last 5 minutes is above
# zero (checked every 30s).
# Notifications are held back 30s when the alert raises and 10m when it clears
# (see the `delay` line) and are sent to the `sysadmin` role.
# NOTE(review): summing over the window only means "new events" if the collector
# publishes this dimension as per-interval increments rather than a running
# total; confirm against the collector.
template: dcgm_gpu_uncorrectable_remapped_rows
on: dcgm.gpu.reliability.row_remap_events
class: Errors
type: GPU
component: NVIDIA
lookup: sum -5m unaligned absolute of uncorrectable_remapped_rows
units: rows
every: 30s
warn: $this > 0
delay: up 30s down 10m multiplier 1.5 max 1h
summary: DCGM uncorrectable remapped rows on GPU ${label:gpu}
info: New uncorrectable row remap events were detected in the last 5 minutes.
to: sysadmin
# DCGM throttle violation alerts.
# dcgm_gpu_power_violation: raises a warning when the sum of the
# `power_violation` dimension of a dcgm.gpu.throttle.violations chart over the
# last 5 minutes is above zero (checked every 30s). Any non-zero amount triggers
# the alert; there is no minimum duration threshold.
# Notifications are held back 1m when the alert raises and 10m when it clears
# (see the `delay` line) and are sent to the `sysadmin` role.
# NOTE(review): `units: milliseconds` is taken on trust here; confirm it matches
# the unit the collector publishes for this dimension.
template: dcgm_gpu_power_violation
on: dcgm.gpu.throttle.violations
class: Workload
type: GPU
component: NVIDIA
lookup: sum -5m unaligned absolute of power_violation
units: milliseconds
every: 30s
warn: $this > 0
delay: up 1m down 10m multiplier 1.5 max 1h
summary: DCGM power throttling detected on GPU ${label:gpu}
info: The GPU was power-throttled during the last 5 minutes.
to: sysadmin
# dcgm_gpu_thermal_violation: raises a warning when the sum of the
# `thermal_violation` dimension of a dcgm.gpu.throttle.violations chart over the
# last 5 minutes is above zero (checked every 30s). Any non-zero amount triggers
# the alert; there is no minimum duration threshold.
# Notifications are held back 1m when the alert raises and 10m when it clears
# (see the `delay` line) and are sent to the `sysadmin` role.
# NOTE(review): `units: milliseconds` is taken on trust here; confirm it matches
# the unit the collector publishes for this dimension.
template: dcgm_gpu_thermal_violation
on: dcgm.gpu.throttle.violations
class: Workload
type: GPU
component: NVIDIA
lookup: sum -5m unaligned absolute of thermal_violation
units: milliseconds
every: 30s
warn: $this > 0
delay: up 1m down 10m multiplier 1.5 max 1h
summary: DCGM thermal throttling detected on GPU ${label:gpu}
info: The GPU was thermally throttled during the last 5 minutes.
to: sysadmin