# File: /usr/lib/netdata/conf.d/health.d/azure_monitor_machine_learning.conf
# you can disable an alarm notification by setting the 'to' line to: silent
# --- Quota ---
# Compute quota utilization of an Azure ML workspace.
template: am_ml_quota_utilization
on: azure_monitor.machine_learning.quota_utilization
component: Azure ML
class: Utilization
type: Other
# Evaluated every minute on the 5-minute average of the `utilization` dimension.
every: 1m
lookup: average -5m unaligned of utilization
units: percentage
# Hysteresis: WARNING raises above 85 and persists while the value stays above 75;
# CRITICAL raises above 95 and persists while the value stays above 85.
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML quota utilization on ${label:resource_name}
info: Average compute quota utilization on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
High quota utilization may prevent new jobs from starting.
# --- Cluster Cores ---
# Unusable compute cores reported on the cluster_cores chart.
template: am_ml_unusable_cores
on: azure_monitor.machine_learning.cluster_cores
component: Azure ML
class: Errors
type: Other
# Evaluated every minute on the 5-minute average of the `unusable` dimension.
every: 1m
lookup: average -5m unaligned of unusable
units: cores
# Hysteresis: WARNING raises above 2 and persists while the value stays above 0;
# CRITICAL raises above 10 and persists while the value stays above 2.
warn: $this > (($status >= $WARNING) ? (0) : (2))
crit: $this > (($status == $CRITICAL) ? (2) : (10))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML unusable cores on ${label:resource_name}
info: Number of unusable compute cores on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Unusable cores indicate hardware or configuration failures.
# Preempted compute cores reported on the cluster_cores chart.
# This template defines a warning condition only; there is no `crit` line.
template: am_ml_preempted_cores
on: azure_monitor.machine_learning.cluster_cores
component: Azure ML
class: Workload
type: Other
# Evaluated every minute on the 5-minute average of the `preempted` dimension.
every: 1m
lookup: average -5m unaligned of preempted
units: cores
# Hysteresis: WARNING raises above 10 and persists while the value stays above 5.
warn: $this > (($status >= $WARNING) ? (5) : (10))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML preempted cores on ${label:resource_name}
info: Number of preempted compute cores on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Preempted cores may cause training job interruptions.
# --- Cluster Nodes ---
# Unusable compute nodes reported on the cluster_nodes chart.
template: am_ml_unusable_nodes
on: azure_monitor.machine_learning.cluster_nodes
component: Azure ML
class: Errors
type: Other
# Evaluated every minute on the 5-minute average of the `unusable` dimension.
every: 1m
lookup: average -5m unaligned of unusable
units: nodes
# Hysteresis: WARNING raises above 1 and persists while the value stays above 0;
# CRITICAL raises above 5 and persists while the value stays above 1.
warn: $this > (($status >= $WARNING) ? (0) : (1))
crit: $this > (($status == $CRITICAL) ? (1) : (5))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML unusable nodes on ${label:resource_name}
info: Number of unusable compute nodes on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Unusable nodes indicate hardware or configuration failures.
# --- CPU Utilization ---
# Cluster CPU utilization of an Azure ML workspace.
template: am_ml_cpu_utilization
on: azure_monitor.machine_learning.cpu_utilization
component: Azure ML
class: Utilization
type: Other
# Evaluated every minute on the 5-minute average of the `cluster_cpu` dimension.
every: 1m
lookup: average -5m unaligned of cluster_cpu
units: percentage
# Hysteresis: WARNING raises above 90 and persists while the value stays above 80;
# CRITICAL raises above 95 and persists while the value stays above 90.
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (95))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML CPU utilization on ${label:resource_name}
info: Average cluster CPU utilization on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- CPU Memory Utilization ---
# CPU memory utilization of an Azure ML workspace.
template: am_ml_cpu_memory_utilization
on: azure_monitor.machine_learning.cpu_memory_utilization
component: Azure ML
class: Utilization
type: Other
# Evaluated every minute on the 5-minute average of the `utilization` dimension.
every: 1m
lookup: average -5m unaligned of utilization
units: percentage
# Hysteresis: WARNING raises above 90 and persists while the value stays above 80;
# CRITICAL raises above 95 and persists while the value stays above 90.
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (95))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML CPU memory utilization on ${label:resource_name}
info: Average CPU memory utilization on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- GPU Utilization ---
# Cluster GPU utilization of an Azure ML workspace.
template: am_ml_gpu_utilization
on: azure_monitor.machine_learning.gpu_utilization
component: Azure ML
class: Utilization
type: Other
# Evaluated every minute on the 5-minute average of the `cluster_gpu` dimension.
every: 1m
lookup: average -5m unaligned of cluster_gpu
units: percentage
# Hysteresis: WARNING raises above 95 and persists while the value stays above 85;
# CRITICAL raises above 99 and persists while the value stays above 95.
warn: $this > (($status >= $WARNING) ? (85) : (95))
crit: $this > (($status == $CRITICAL) ? (95) : (99))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML GPU utilization on ${label:resource_name}
info: Average cluster GPU utilization on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- GPU Memory Utilization ---
# Cluster GPU memory utilization of an Azure ML workspace.
template: am_ml_gpu_memory_utilization
on: azure_monitor.machine_learning.gpu_memory_utilization
component: Azure ML
class: Utilization
type: Other
# Evaluated every minute on the 5-minute average of the `cluster_gpu_memory` dimension.
every: 1m
lookup: average -5m unaligned of cluster_gpu_memory
units: percentage
# Hysteresis: WARNING raises above 90 and persists while the value stays above 80;
# CRITICAL raises above 95 and persists while the value stays above 90.
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (95))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML GPU memory utilization on ${label:resource_name}
info: Average cluster GPU memory utilization on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
High GPU memory usage may cause out-of-memory training failures.
# --- Disk Usage ---
# Disk utilization derived from the `used` and `available` dimensions of the disk_usage chart.
template: am_ml_disk_utilization
on: azure_monitor.machine_learning.disk_usage
component: Azure ML
class: Utilization
type: Other
# No lookup here: every minute the value is computed as used / (used + available) * 100.
# The guard yields 0 when used + available is not positive, avoiding a division by zero.
every: 1m
calc: ($used + $available > 0) ? ($used * 100 / ($used + $available)) : (0)
units: %
# Hysteresis: WARNING raises above 85 and persists while the value stays above 75;
# CRITICAL raises above 95 and persists while the value stays above 85.
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML disk utilization on ${label:resource_name}
info: Disk utilization percentage on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
High disk usage can cause training job failures.
# --- Model Deployments ---
# Failed model deployments on an Azure ML workspace.
template: am_ml_model_deploy_failures
on: azure_monitor.machine_learning.model_deployments
component: Azure ML
class: Errors
type: Other
# Evaluated every minute on the 5-minute sum of the `failed` dimension.
every: 1m
lookup: sum -5m unaligned of failed
units: deployments
# Hysteresis: WARNING raises above 1 and persists while the value stays above 0;
# CRITICAL raises above 5 and persists while the value stays above 1.
warn: $this > (($status >= $WARNING) ? (0) : (1))
crit: $this > (($status == $CRITICAL) ? (1) : (5))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML model deployment failures on ${label:resource_name}
info: Number of failed model deployments over the last 5 minutes on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Model Registrations ---
# Failed model registrations on an Azure ML workspace.
template: am_ml_model_register_failures
on: azure_monitor.machine_learning.model_registrations
component: Azure ML
class: Errors
type: Other
# Evaluated every minute on the 5-minute sum of the `failed` dimension.
every: 1m
lookup: sum -5m unaligned of failed
units: registrations
# Hysteresis: WARNING raises above 1 and persists while the value stays above 0;
# CRITICAL raises above 5 and persists while the value stays above 1.
warn: $this > (($status >= $WARNING) ? (0) : (1))
crit: $this > (($status == $CRITICAL) ? (1) : (5))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML model registration failures on ${label:resource_name}
info: Number of failed model registrations over the last 5 minutes on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Run Completion ---
# Failed runs reported on the run_completion chart.
template: am_ml_failed_runs
on: azure_monitor.machine_learning.run_completion
component: Azure ML
class: Errors
type: Other
# Evaluated every minute on the 5-minute sum of the `failed` dimension.
every: 1m
lookup: sum -5m unaligned of failed
units: runs
# Hysteresis: WARNING raises above 3 and persists while the value stays above 0;
# CRITICAL raises above 10 and persists while the value stays above 3.
warn: $this > (($status >= $WARNING) ? (0) : (3))
crit: $this > (($status == $CRITICAL) ? (3) : (10))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML failed runs on ${label:resource_name}
info: Number of failed training/experiment runs over the last 5 minutes on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Runs that stopped responding, reported on the run_completion chart.
template: am_ml_not_responding_runs
on: azure_monitor.machine_learning.run_completion
component: Azure ML
class: Availability
type: Other
# Evaluated every minute on the 5-minute sum of the `not_responding` dimension.
every: 1m
lookup: sum -5m unaligned of not_responding
units: runs
# Hysteresis: WARNING raises above 1 and persists while the value stays above 0;
# CRITICAL raises above 5 and persists while the value stays above 1.
warn: $this > (($status >= $WARNING) ? (0) : (1))
crit: $this > (($status == $CRITICAL) ? (1) : (5))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML not-responding runs on ${label:resource_name}
info: Number of runs that stopped responding over the last 5 minutes on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Not-responding runs indicate compute or infrastructure issues.
# --- Run Issues ---
# Run errors reported on the run_issues chart.
template: am_ml_run_errors
on: azure_monitor.machine_learning.run_issues
component: Azure ML
class: Errors
type: Other
# Evaluated every minute on the 5-minute sum of the `errors` dimension.
every: 1m
lookup: sum -5m unaligned of errors
units: errors
# Hysteresis: WARNING raises above 5 and persists while the value stays above 0;
# CRITICAL raises above 20 and persists while the value stays above 5.
warn: $this > (($status >= $WARNING) ? (0) : (5))
crit: $this > (($status == $CRITICAL) ? (5) : (20))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML run errors on ${label:resource_name}
info: Number of run errors over the last 5 minutes on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Storage API ---
# Helper value: total storage API calls (success + failure) over the last 5 minutes.
# It has no warn/crit expressions and no recipient; the am_ml_storage_api_failures
# template reads it as $am_ml_storage_api_total to compute a failure percentage.
template: am_ml_storage_api_total
on: azure_monitor.machine_learning.storage_api_calls
component: Azure ML
class: Workload
type: Other
# Evaluated every minute on the 5-minute sum of the `success` and `failure` dimensions.
every: 1m
lookup: sum -5m unaligned of success,failure
units: calls
info: Total storage API calls over the last 5 minutes on Azure ML workspace ${label:resource_name}
# Storage API failure rate, as a percentage of all storage API calls.
template: am_ml_storage_api_failures
on: azure_monitor.machine_learning.storage_api_calls
component: Azure ML
class: Errors
type: Other
# Every minute: sum the `failure` dimension over 5 minutes, then convert it to a
# percentage of $am_ml_storage_api_total. When the total is 10 calls or fewer the
# result is forced to 0, so low-traffic periods do not produce a rate.
every: 1m
lookup: sum -5m unaligned of failure
calc: ($am_ml_storage_api_total > 10) ? ($this * 100 / $am_ml_storage_api_total) : (0)
units: %
# Hysteresis: WARNING raises above 5 and persists while the value stays above 1;
# CRITICAL raises above 15 and persists while the value stays above 5.
warn: $this > (($status >= $WARNING) ? (1) : (5))
crit: $this > (($status == $CRITICAL) ? (5) : (15))
# Notification routing; status-decrease notifications are delayed (5m base, x1.5, capped at 1h).
to: sysadmin
delay: down 5m multiplier 1.5 max 1h
summary: ML storage API failure rate on ${label:resource_name}
info: Percentage of failed storage API calls over the last 5 minutes on Azure ML workspace ${label:resource_name} \
in ${label:resource_group} (${label:region})