# Netdata health alerts for Azure Monitor - Azure Machine Learning workspaces
#
# File: /usr/lib/netdata/conf.d/health.d/azure_monitor_machine_learning.conf
# you can disable an alarm notification by setting the 'to' line to: silent

# --- Quota ---

 # Alert on compute quota utilization of an Azure ML workspace.
 # Value: average of the "utilization" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 85, stays raised while the value is above 75
 #   critical - raised above 95, stays raised while the value is above 85
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_quota_utilization
       on: azure_monitor.machine_learning.quota_utilization
    class: Utilization
     type: Other
component: Azure ML
   lookup: average -5m unaligned of utilization
    units: percentage
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML quota utilization on ${label:resource_name}
     info: Average compute quota utilization on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region}). \
           High quota utilization may prevent new jobs from starting.
       to: sysadmin

# --- Cluster Cores ---

 # Alert on unusable compute cores in an Azure ML cluster.
 # Value: average of the "unusable" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 2 cores, stays raised while the value is above 0
 #   critical - raised above 10 cores, stays raised while the value is above 2
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_unusable_cores
       on: azure_monitor.machine_learning.cluster_cores
    class: Errors
     type: Other
component: Azure ML
   lookup: average -5m unaligned of unusable
    units: cores
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (0) : (2))
     crit: $this > (($status == $CRITICAL) ? (2) : (10))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML unusable cores on ${label:resource_name}
     info: Number of unusable compute cores on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region}). \
           Unusable cores indicate hardware or configuration failures.
       to: sysadmin

 # Alert on preempted compute cores in an Azure ML cluster.
 # Value: average of the "preempted" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Only a warning level is defined (there is no crit line):
 #   warning - raised above 10 cores, stays raised while the value is above 5
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_preempted_cores
       on: azure_monitor.machine_learning.cluster_cores
    class: Workload
     type: Other
component: Azure ML
   lookup: average -5m unaligned of preempted
    units: cores
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (5) : (10))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML preempted cores on ${label:resource_name}
     info: Number of preempted compute cores on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region}). \
           Preempted cores may cause training job interruptions.
       to: sysadmin

# --- Cluster Nodes ---

 # Alert on unusable compute nodes in an Azure ML cluster.
 # Value: average of the "unusable" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 1 node, stays raised while the value is above 0
 #   critical - raised above 5 nodes, stays raised while the value is above 1
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_unusable_nodes
       on: azure_monitor.machine_learning.cluster_nodes
    class: Errors
     type: Other
component: Azure ML
   lookup: average -5m unaligned of unusable
    units: nodes
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (0) : (1))
     crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML unusable nodes on ${label:resource_name}
     info: Number of unusable compute nodes on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region}). \
           Unusable nodes indicate hardware or configuration failures.
       to: sysadmin

# --- CPU Utilization ---

 # Alert on cluster CPU utilization of an Azure ML workspace.
 # Value: average of the "cluster_cpu" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 90, stays raised while the value is above 80
 #   critical - raised above 95, stays raised while the value is above 90
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_cpu_utilization
       on: azure_monitor.machine_learning.cpu_utilization
    class: Utilization
     type: Other
component: Azure ML
   lookup: average -5m unaligned of cluster_cpu
    units: percentage
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (95))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML CPU utilization on ${label:resource_name}
     info: Average cluster CPU utilization on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- CPU Memory Utilization ---

 # Alert on CPU (host) memory utilization of an Azure ML workspace.
 # Value: average of the "utilization" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 90, stays raised while the value is above 80
 #   critical - raised above 95, stays raised while the value is above 90
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_cpu_memory_utilization
       on: azure_monitor.machine_learning.cpu_memory_utilization
    class: Utilization
     type: Other
component: Azure ML
   lookup: average -5m unaligned of utilization
    units: percentage
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (95))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML CPU memory utilization on ${label:resource_name}
     info: Average CPU memory utilization on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- GPU Utilization ---

 # Alert on cluster GPU utilization of an Azure ML workspace.
 # Value: average of the "cluster_gpu" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 95, stays raised while the value is above 85
 #   critical - raised above 99, stays raised while the value is above 95
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_gpu_utilization
       on: azure_monitor.machine_learning.gpu_utilization
    class: Utilization
     type: Other
component: Azure ML
   lookup: average -5m unaligned of cluster_gpu
    units: percentage
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (85) : (95))
     crit: $this > (($status == $CRITICAL) ? (95) : (99))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML GPU utilization on ${label:resource_name}
     info: Average cluster GPU utilization on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- GPU Memory Utilization ---

 # Alert on cluster GPU memory utilization of an Azure ML workspace.
 # Value: average of the "cluster_gpu_memory" dimension over the last
 # 5 minutes, re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 90, stays raised while the value is above 80
 #   critical - raised above 95, stays raised while the value is above 90
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_gpu_memory_utilization
       on: azure_monitor.machine_learning.gpu_memory_utilization
    class: Utilization
     type: Other
component: Azure ML
   lookup: average -5m unaligned of cluster_gpu_memory
    units: percentage
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (95))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML GPU memory utilization on ${label:resource_name}
     info: Average cluster GPU memory utilization on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region}). \
           High GPU memory usage may cause out-of-memory training failures.
       to: sysadmin

# --- Disk Usage ---

 # Alert on disk utilization of an Azure ML workspace.
 # Value: used * 100 / (used + available), computed by the calc line from
 # the $used and $available variables. There is no lookup line, so no 5-minute
 # averaging is applied here (presumably the latest collected dimension
 # values are used - verify against the collector).
 # The calc guard yields 0 when used + available is not positive, which
 # avoids a division by zero.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 85, stays raised while the value is above 75
 #   critical - raised above 95, stays raised while the value is above 85
 # NOTE(review): units is "%" here while the other utilization templates in
 # this file use "percentage" - confirm which spelling is intended.
 template: am_ml_disk_utilization
       on: azure_monitor.machine_learning.disk_usage
    class: Utilization
     type: Other
component: Azure ML
     calc: ($used + $available > 0) ? ($used * 100 / ($used + $available)) : (0)
    units: %
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML disk utilization on ${label:resource_name}
     info: Disk utilization percentage on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region}). \
           High disk usage can cause training job failures.
       to: sysadmin

# --- Model Deployments ---

 # Alert on failed model deployments in an Azure ML workspace.
 # Value: sum of the "failed" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 1 failure, stays raised while the sum is above 0
 #   critical - raised above 5 failures, stays raised while the sum is above 1
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_model_deploy_failures
       on: azure_monitor.machine_learning.model_deployments
    class: Errors
     type: Other
component: Azure ML
   lookup: sum -5m unaligned of failed
    units: deployments
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (0) : (1))
     crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML model deployment failures on ${label:resource_name}
     info: Number of failed model deployments over the last 5 minutes on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- Model Registrations ---

 # Alert on failed model registrations in an Azure ML workspace.
 # Value: sum of the "failed" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 1 failure, stays raised while the sum is above 0
 #   critical - raised above 5 failures, stays raised while the sum is above 1
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_model_register_failures
       on: azure_monitor.machine_learning.model_registrations
    class: Errors
     type: Other
component: Azure ML
   lookup: sum -5m unaligned of failed
    units: registrations
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (0) : (1))
     crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML model registration failures on ${label:resource_name}
     info: Number of failed model registrations over the last 5 minutes on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- Run Completion ---

 # Alert on failed training/experiment runs in an Azure ML workspace.
 # Value: sum of the "failed" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 3 runs, stays raised while the sum is above 0
 #   critical - raised above 10 runs, stays raised while the sum is above 3
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_failed_runs
       on: azure_monitor.machine_learning.run_completion
    class: Errors
     type: Other
component: Azure ML
   lookup: sum -5m unaligned of failed
    units: runs
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (0) : (3))
     crit: $this > (($status == $CRITICAL) ? (3) : (10))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML failed runs on ${label:resource_name}
     info: Number of failed training/experiment runs over the last 5 minutes on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

 # Alert on runs that stopped responding in an Azure ML workspace.
 # Value: sum of the "not_responding" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 1 run, stays raised while the sum is above 0
 #   critical - raised above 5 runs, stays raised while the sum is above 1
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_not_responding_runs
       on: azure_monitor.machine_learning.run_completion
    class: Availability
     type: Other
component: Azure ML
   lookup: sum -5m unaligned of not_responding
    units: runs
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (0) : (1))
     crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML not-responding runs on ${label:resource_name}
     info: Number of runs that stopped responding over the last 5 minutes on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region}). \
           Not-responding runs indicate compute or infrastructure issues.
       to: sysadmin

# --- Run Issues ---

 # Alert on run errors in an Azure ML workspace.
 # Value: sum of the "errors" dimension over the last 5 minutes,
 # re-evaluated every minute.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 5 errors, stays raised while the sum is above 0
 #   critical - raised above 20 errors, stays raised while the sum is above 5
 # The delay line applies to status decreases (5m base, x1.5 multiplier,
 # 1h maximum). Notifications are addressed to the "sysadmin" recipient.
 template: am_ml_run_errors
       on: azure_monitor.machine_learning.run_issues
    class: Errors
     type: Other
component: Azure ML
   lookup: sum -5m unaligned of errors
    units: errors
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (0) : (5))
     crit: $this > (($status == $CRITICAL) ? (5) : (20))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML run errors on ${label:resource_name}
     info: Number of run errors over the last 5 minutes on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- Storage API ---

 # Helper value: total storage API calls in the last 5 minutes.
 # Value: sum over the last 5 minutes of the "success" and "failure"
 # dimensions, re-evaluated every minute.
 # This template defines no warn, crit or to lines, so it sets no alert
 # thresholds of its own. Its value is read as $am_ml_storage_api_total by
 # the am_ml_storage_api_failures template, which uses it as the denominator
 # of the failure rate.
 template: am_ml_storage_api_total
       on: azure_monitor.machine_learning.storage_api_calls
    class: Workload
     type: Other
component: Azure ML
   lookup: sum -5m unaligned of success,failure
    units: calls
    every: 1m
     info: Total storage API calls over the last 5 minutes on Azure ML workspace ${label:resource_name}

 # Alert on the storage API failure rate of an Azure ML workspace.
 # The lookup sums the "failure" dimension over the last 5 minutes; the calc
 # line then converts it to a percentage of $am_ml_storage_api_total (the
 # value of the am_ml_storage_api_total template on the same chart).
 # The rate is only computed when more than 10 total calls were seen;
 # otherwise the value is forced to 0, which keeps a handful of calls from
 # producing a large percentage and avoids a division by zero.
 # Thresholds use hysteresis driven by the current alert status:
 #   warning  - raised above 5%, stays raised while the rate is above 1%
 #   critical - raised above 15%, stays raised while the rate is above 5%
 # NOTE(review): units is "%" here while the utilization templates earlier
 # in this file use "percentage" - confirm which spelling is intended.
 template: am_ml_storage_api_failures
       on: azure_monitor.machine_learning.storage_api_calls
    class: Errors
     type: Other
component: Azure ML
   lookup: sum -5m unaligned of failure
     calc: ($am_ml_storage_api_total > 10) ? ($this * 100 / $am_ml_storage_api_total) : (0)
    units: %
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (1) : (5))
     crit: $this > (($status == $CRITICAL) ? (5) : (15))
    delay: down 5m multiplier 1.5 max 1h
  summary: ML storage API failure rate on ${label:resource_name}
     info: Percentage of failed storage API calls over the last 5 minutes on Azure ML workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin