Intrusion Exploit
Server: LiteSpeed
System: Linux cisadane.iixcp.rumahweb.net 5.14.0-427.42.1.el9_4.x86_64 #1 SMP PREEMPT_DYNAMIC Fri Nov 1 14:58:02 EDT 2024 x86_64
User: lenf4658 (1805)
PHP: 8.4.19
Disabled: NONE
Upload Files
File: //usr/lib/netdata/conf.d/health.d/dcgm.conf
# DCGM GPU reliability alerts.

 # Alert: XID error reported for a GPU.
 # Applies to every chart with context dcgm.gpu.reliability.xid. Every 30s it
 # takes the maximum of the `xid` dimension over the last minute (values made
 # absolute, window not aligned to a time boundary) and raises WARNING when
 # that maximum is above zero. No critical threshold is defined.
 # Notifications wait 30s when the status rises and 5m when it falls; the
 # delay is multiplied by 1.5 on repeated status changes, capped at 1h.
 template: dcgm_gpu_xid_errors
       on: dcgm.gpu.reliability.xid
    class: Errors
     type: GPU
component: NVIDIA
   lookup: max -1m unaligned absolute of xid
    units: code
    every: 30s
     warn: $this > 0
    delay: up 30s down 5m multiplier 1.5 max 1h
  summary: DCGM reported XID error on GPU ${label:gpu}
     info: NVIDIA driver reported a GPU XID error (metric ${label:chart_context}).
       to: sysadmin

 # Alert: GPU memory row remapping failed.
 # Applies to every chart with context dcgm.gpu.reliability.row_remap_status.
 # Every 30s it takes the maximum of the `row_remap_failure` dimension over
 # the last minute and raises WARNING when that maximum is above zero.
 # No critical threshold is defined.
 # Notifications wait 30s when the status rises and 5m when it falls; the
 # delay is multiplied by 1.5 on repeated status changes, capped at 1h.
 template: dcgm_gpu_row_remap_failure
       on: dcgm.gpu.reliability.row_remap_status
    class: Errors
     type: GPU
component: NVIDIA
   lookup: max -1m unaligned absolute of row_remap_failure
    units: state
    every: 30s
     warn: $this > 0
    delay: up 30s down 5m multiplier 1.5 max 1h
  summary: DCGM row remap failure on GPU ${label:gpu}
     info: Row remapping has failed, indicating a persistent memory reliability problem.
       to: sysadmin

 # Alert: new uncorrectable row remap events on a GPU.
 # Applies to every chart with context dcgm.gpu.reliability.row_remap_events.
 # Every 30s it sums the `uncorrectable_remapped_rows` dimension over the
 # last 5 minutes and raises WARNING when the sum is above zero.
 # No critical threshold is defined.
 # Notifications wait 30s when the status rises and 10m when it falls; the
 # delay is multiplied by 1.5 on repeated status changes, capped at 1h.
 template: dcgm_gpu_uncorrectable_remapped_rows
       on: dcgm.gpu.reliability.row_remap_events
    class: Errors
     type: GPU
component: NVIDIA
   lookup: sum -5m unaligned absolute of uncorrectable_remapped_rows
    units: rows
    every: 30s
     warn: $this > 0
    delay: up 30s down 10m multiplier 1.5 max 1h
  summary: DCGM uncorrectable remapped rows on GPU ${label:gpu}
     info: New uncorrectable row remap events were detected in the last 5 minutes.
       to: sysadmin

# DCGM throttle violation alerts.

 # Alert: GPU was power-throttled.
 # Applies to every chart with context dcgm.gpu.throttle.violations and reads
 # only its `power_violation` dimension (the thermal alert below uses the
 # same charts with a different dimension). Every 30s it sums that dimension
 # over the last 5 minutes and raises WARNING when the sum is above zero.
 # No critical threshold is defined.
 # Notifications wait 1m when the status rises and 10m when it falls; the
 # delay is multiplied by 1.5 on repeated status changes, capped at 1h.
 template: dcgm_gpu_power_violation
       on: dcgm.gpu.throttle.violations
    class: Workload
     type: GPU
component: NVIDIA
   lookup: sum -5m unaligned absolute of power_violation
    units: milliseconds
    every: 30s
     warn: $this > 0
    delay: up 1m down 10m multiplier 1.5 max 1h
  summary: DCGM power throttling detected on GPU ${label:gpu}
     info: The GPU was power-throttled during the last 5 minutes.
       to: sysadmin

 # Alert: GPU was thermally throttled.
 # Applies to every chart with context dcgm.gpu.throttle.violations and reads
 # only its `thermal_violation` dimension. Every 30s it sums that dimension
 # over the last 5 minutes and raises WARNING when the sum is above zero.
 # No critical threshold is defined.
 # Notifications wait 1m when the status rises and 10m when it falls; the
 # delay is multiplied by 1.5 on repeated status changes, capped at 1h.
 template: dcgm_gpu_thermal_violation
       on: dcgm.gpu.throttle.violations
    class: Workload
     type: GPU
component: NVIDIA
   lookup: sum -5m unaligned absolute of thermal_violation
    units: milliseconds
    every: 30s
     warn: $this > 0
    delay: up 1m down 10m multiplier 1.5 max 1h
  summary: DCGM thermal throttling detected on GPU ${label:gpu}
     info: The GPU was thermally throttled during the last 5 minutes.
       to: sysadmin