# File: /usr/lib/netdata/conf.d/health.d/azure_monitor_data_factory.conf
# you can disable an alarm notification by setting the 'to' line to: silent
# --- Pipeline Runs ---
# Failed pipeline runs: sums the `failed` dimension over the last 5 minutes.
# WARNING is raised when the sum exceeds 5 and, once raised, is held until
# the sum is no longer above 0.
template: am_data_factory_pipeline_failed_runs
on: azure_monitor.data_factory.pipeline_runs
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of failed
units: runs
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory pipeline failures on ${label:resource_name}
info: Failed pipeline runs on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region}) over the last 5 minutes
# Cancelled pipeline runs: sums the `cancelled` dimension over the last
# 10 minutes. WARNING is raised above 10 and held while the sum stays above 5.
template: am_data_factory_pipeline_cancelled_runs
on: azure_monitor.data_factory.pipeline_runs
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -10m unaligned of cancelled
units: runs
warn: $this > (($status >= $WARNING) ? (5) : (10))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory pipeline cancellations on ${label:resource_name}
info: Cancelled pipeline runs on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region}) over the last 10 minutes. \
Frequent cancellations may indicate configuration or dependency issues
# --- Activity Runs ---
# Failed activity runs: sums the `failed` dimension over the last 5 minutes.
# WARNING is raised above 5 and held while the sum stays above 0.
template: am_data_factory_activity_failed_runs
on: azure_monitor.data_factory.activity_runs
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of failed
units: runs
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory activity failures on ${label:resource_name}
info: Failed activity runs on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region}) over the last 5 minutes
# --- Trigger Runs ---
# Failed trigger runs: sums the `failed` dimension over the last 5 minutes.
# WARNING is raised above 5 and held while the sum stays above 0.
template: am_data_factory_trigger_failed_runs
on: azure_monitor.data_factory.trigger_runs
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of failed
units: runs
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory trigger failures on ${label:resource_name}
info: Failed trigger runs on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region}) over the last 5 minutes
# --- SSIS Integration Runtime ---
# SSIS IR start failures: sums the `failed` dimension over the last 5 minutes.
# WARNING is raised above 1 and held while the sum stays above 0.
# The `$this != nan` guard makes the condition false when the lookup is nan.
template: am_data_factory_ssis_ir_start_failures
on: azure_monitor.data_factory.ssis_ir_starts
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of failed
units: runs
warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (1))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory SSIS IR start failures on ${label:resource_name}
info: Failed SSIS Integration Runtime start operations on Data Factory \
${label:resource_name} in ${label:resource_group} (${label:region})
# SSIS IR stuck stops: sums the `stuck` dimension over the last 5 minutes.
# WARNING is raised for any non-nan sum above 0; there is no separate
# (lower) clearing threshold on this alert.
template: am_data_factory_ssis_ir_stop_stuck
on: azure_monitor.data_factory.ssis_ir_stops
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of stuck
units: runs
warn: $this != nan AND $this > 0
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory SSIS IR stuck stops on ${label:resource_name}
info: SSIS Integration Runtime stop operations stuck on Data Factory \
${label:resource_name} in ${label:resource_group} (${label:region})
# SSIS package failures: sums the `failed` dimension over the last 5 minutes.
# WARNING is raised above 1 and held while the sum stays above 0.
# The `$this != nan` guard makes the condition false when the lookup is nan.
template: am_data_factory_ssis_package_failures
on: azure_monitor.data_factory.ssis_package_executions
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of failed
units: executions
warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (1))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory SSIS package failures on ${label:resource_name}
info: Failed SSIS package executions on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Integration Runtime Resources ---
# Integration Runtime CPU: 5-minute average of the `average` dimension.
# WARNING is raised above 85 and held while above 75;
# CRITICAL is raised above 95 and held while above 85.
# Units are written as `%` to match the other utilization alerts in this file.
template: am_data_factory_ir_cpu
on: azure_monitor.data_factory.ir_cpu
class: Utilization
type: Other
component: Azure Data Factory
lookup: average -5m unaligned of average
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 5m multiplier 1.5 max 1h
summary: Data Factory IR CPU on ${label:resource_name}
info: Integration Runtime CPU utilization on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region})
to: sysadmin
# Integration Runtime queue: 5-minute average of the `queue_length` dimension.
# WARNING is raised above 20 and held while above 10;
# CRITICAL is raised above 50 and held while above 20.
template: am_data_factory_ir_queue_length
on: azure_monitor.data_factory.ir_queue
component: Azure Data Factory
class: Workload
type: Other
every: 1m
lookup: average -5m unaligned of queue_length
units: tasks
warn: $this > (($status >= $WARNING) ? (10) : (20))
crit: $this > (($status == $CRITICAL) ? (20) : (50))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory IR queue depth on ${label:resource_name}
info: Integration Runtime queue length on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Growing queues indicate the runtime cannot keep up with submitted work
# Integration Runtime task pickup delay: 5-minute average of the `average`
# dimension. WARNING is raised above 60 and held while above 30;
# CRITICAL is raised above 120 and held while above 60.
template: am_data_factory_ir_task_pickup_delay
on: azure_monitor.data_factory.ir_task_pickup_delay
component: Azure Data Factory
class: Latency
type: Other
every: 1m
lookup: average -5m unaligned of average
units: seconds
warn: $this > (($status >= $WARNING) ? (30) : (60))
crit: $this > (($status == $CRITICAL) ? (60) : (120))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory IR task pickup delay on ${label:resource_name}
info: Average task pickup delay for Integration Runtime on Data Factory \
${label:resource_name} in ${label:resource_group} (${label:region}). \
High delay indicates insufficient runtime capacity
# --- Factory Capacity ---
# Factory size utilization: `calc` expresses `current` as a percentage of
# `max_allowed`, yielding 0 when `max_allowed` is not positive so the division
# is never attempted with a zero divisor. Evaluated every 5 minutes.
# WARNING is raised above 80 and held while above 70;
# CRITICAL is raised above 90 and held while above 80.
template: am_data_factory_size_utilization
on: azure_monitor.data_factory.factory_size
component: Azure Data Factory
class: Utilization
type: Other
every: 5m
calc: ($max_allowed > 0) ? ($current * 100 / $max_allowed) : (0)
units: %
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (80) : (90))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory size utilization on ${label:resource_name}
info: Factory size as percentage of maximum allowed on Data Factory \
${label:resource_name} in ${label:resource_group} (${label:region})
# Entity count utilization: `calc` expresses `current` as a percentage of
# `max_allowed`, yielding 0 when `max_allowed` is not positive so the division
# is never attempted with a zero divisor. Evaluated every 5 minutes.
# WARNING is raised above 80 and held while above 70;
# CRITICAL is raised above 90 and held while above 80.
template: am_data_factory_entity_utilization
on: azure_monitor.data_factory.entity_count
component: Azure Data Factory
class: Utilization
type: Other
every: 5m
calc: ($max_allowed > 0) ? ($current * 100 / $max_allowed) : (0)
units: %
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (80) : (90))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory entity count on ${label:resource_name}
info: Entity count (pipelines, datasets, etc.) as percentage of maximum allowed \
on Data Factory ${label:resource_name} in ${label:resource_group} (${label:region})
# --- MVNet IR Capacity ---
# Managed VNet IR copy capacity: 5-minute average of the `utilization`
# dimension. WARNING is raised above 80 and held while above 70;
# CRITICAL is raised above 90 and held while above 80.
# The `$this != nan` guard makes both conditions false when the lookup is nan.
# Units are written as `%` to match the other utilization alerts in this file.
template: am_data_factory_mvnet_ir_copy_utilization
on: azure_monitor.data_factory.mvnet_ir_copy_capacity
class: Utilization
type: Other
component: Azure Data Factory
lookup: average -5m unaligned of utilization
units: %
every: 1m
warn: $this != nan AND $this > (($status >= $WARNING) ? (70) : (80))
crit: $this != nan AND $this > (($status == $CRITICAL) ? (80) : (90))
delay: down 5m multiplier 1.5 max 1h
summary: Data Factory MVNet IR copy utilization on ${label:resource_name}
info: Managed VNet Integration Runtime copy capacity utilization on Data Factory \
${label:resource_name} in ${label:resource_group} (${label:region})
to: sysadmin
# Managed VNet IR external activity capacity: 5-minute average of the
# `utilization` dimension. WARNING is raised above 80 and held while above 70;
# CRITICAL is raised above 90 and held while above 80.
# The `$this != nan` guard makes both conditions false when the lookup is nan.
# Units are written as `%` to match the other utilization alerts in this file.
template: am_data_factory_mvnet_ir_external_utilization
on: azure_monitor.data_factory.mvnet_ir_external_capacity
class: Utilization
type: Other
component: Azure Data Factory
lookup: average -5m unaligned of utilization
units: %
every: 1m
warn: $this != nan AND $this > (($status >= $WARNING) ? (70) : (80))
crit: $this != nan AND $this > (($status == $CRITICAL) ? (80) : (90))
delay: down 5m multiplier 1.5 max 1h
summary: Data Factory MVNet IR external utilization on ${label:resource_name}
info: Managed VNet Integration Runtime external activity capacity utilization \
on Data Factory ${label:resource_name} in ${label:resource_group} (${label:region})
to: sysadmin
# Managed VNet IR pipeline capacity: 5-minute average of the `utilization`
# dimension. WARNING is raised above 80 and held while above 70;
# CRITICAL is raised above 90 and held while above 80.
# The `$this != nan` guard makes both conditions false when the lookup is nan.
# Units are written as `%` to match the other utilization alerts in this file.
template: am_data_factory_mvnet_ir_pipeline_utilization
on: azure_monitor.data_factory.mvnet_ir_pipeline_capacity
class: Utilization
type: Other
component: Azure Data Factory
lookup: average -5m unaligned of utilization
units: %
every: 1m
warn: $this != nan AND $this > (($status >= $WARNING) ? (70) : (80))
crit: $this != nan AND $this > (($status == $CRITICAL) ? (80) : (90))
delay: down 5m multiplier 1.5 max 1h
summary: Data Factory MVNet IR pipeline utilization on ${label:resource_name}
info: Managed VNet Integration Runtime pipeline capacity utilization \
on Data Factory ${label:resource_name} in ${label:resource_group} (${label:region})
to: sysadmin
# --- Airflow IR Resources ---
# Airflow IR CPU: 5-minute average of the `percentage` dimension.
# WARNING is raised above 85 and held while above 75;
# CRITICAL is raised above 95 and held while above 85.
# The `$this != nan` guard makes both conditions false when the lookup is nan.
# Units are written as `%` to match the other utilization alerts in this file.
template: am_data_factory_airflow_ir_cpu
on: azure_monitor.data_factory.airflow_ir_cpu
class: Utilization
type: Other
component: Azure Data Factory
lookup: average -5m unaligned of percentage
units: %
every: 1m
warn: $this != nan AND $this > (($status >= $WARNING) ? (75) : (85))
crit: $this != nan AND $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 5m multiplier 1.5 max 1h
summary: Data Factory Airflow IR CPU on ${label:resource_name}
info: Airflow Integration Runtime CPU utilization on Data Factory \
${label:resource_name} in ${label:resource_group} (${label:region})
to: sysadmin
# Airflow IR memory: 5-minute average of the `percentage` dimension.
# WARNING is raised above 85 and held while above 75;
# CRITICAL is raised above 95 and held while above 85.
# The `$this != nan` guard makes both conditions false when the lookup is nan.
# Units are written as `%` to match the other utilization alerts in this file.
template: am_data_factory_airflow_ir_memory
on: azure_monitor.data_factory.airflow_ir_memory
class: Utilization
type: Other
component: Azure Data Factory
lookup: average -5m unaligned of percentage
units: %
every: 1m
warn: $this != nan AND $this > (($status >= $WARNING) ? (75) : (85))
crit: $this != nan AND $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 5m multiplier 1.5 max 1h
summary: Data Factory Airflow IR memory on ${label:resource_name}
info: Airflow Integration Runtime memory utilization on Data Factory \
${label:resource_name} in ${label:resource_group} (${label:region})
to: sysadmin
# --- Airflow IR DAG Errors ---
# Airflow DAG errors: sums the `callback_exceptions`, `file_refresh` and
# `import` dimensions over the last 5 minutes. WARNING is raised above 5 and
# held while the sum stays above 0.
# The `$this != nan` guard makes the condition false when the lookup is nan.
template: am_data_factory_airflow_ir_dag_errors
on: azure_monitor.data_factory.airflow_ir_dag_errors
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of callback_exceptions,file_refresh,import
units: errors
warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (5))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory Airflow DAG errors on ${label:resource_name}
info: DAG processing errors (callback exceptions, file refresh errors, import errors) \
on Airflow IR of Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Airflow IR Operators ---
# Airflow operator failures: sums the `failures` dimension over the last
# 5 minutes. WARNING is raised above 5 and held while the sum stays above 0.
# The `$this != nan` guard makes the condition false when the lookup is nan.
template: am_data_factory_airflow_ir_operator_failures
on: azure_monitor.data_factory.airflow_ir_operators
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of failures
units: operations
warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (5))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory Airflow operator failures on ${label:resource_name}
info: Failed Airflow operator executions on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Airflow IR Jobs ---
# Airflow job heartbeat failures: sums the `heartbeat_failures` dimension over
# the last 5 minutes. WARNING is raised above 1 and held while the sum stays
# above 0.
# The `$this != nan` guard makes the condition false when the lookup is nan.
template: am_data_factory_airflow_ir_job_heartbeat_failures
on: azure_monitor.data_factory.airflow_ir_jobs
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of heartbeat_failures
units: failures
warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (1))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory Airflow job heartbeat failures on ${label:resource_name}
info: Airflow job heartbeat failures on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Heartbeat failures indicate scheduler or worker health issues
# --- Airflow IR Pool Starvation ---
# Airflow pool starvation: sums the `starving` dimension over the last
# 5 minutes. WARNING is raised above 5 and held while the sum stays above 0.
# The `$this != nan` guard makes the condition false when the lookup is nan.
template: am_data_factory_airflow_ir_pool_starving
on: azure_monitor.data_factory.airflow_ir_pool_starving
component: Azure Data Factory
class: Workload
type: Other
every: 1m
lookup: sum -5m unaligned of starving
units: tasks
warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (5))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory Airflow pool starvation on ${label:resource_name}
info: Starving tasks in Airflow pool on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Tasks are waiting for pool slots, consider increasing pool size
# --- Airflow IR Scheduler Tasks ---
# Airflow tasks killed externally: sums the `killed_externally` dimension over
# the last 5 minutes. WARNING is raised above 3 and held while the sum stays
# above 0.
# The `$this != nan` guard makes the condition false when the lookup is nan.
template: am_data_factory_airflow_ir_tasks_killed_externally
on: azure_monitor.data_factory.airflow_ir_scheduler_tasks
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of killed_externally
units: tasks
warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (3))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory Airflow tasks killed externally on ${label:resource_name}
info: Tasks killed externally by the Airflow scheduler on Data Factory \
${label:resource_name} in ${label:resource_group} (${label:region}). \
May indicate OOM kills or infrastructure issues
# Airflow scheduler starving tasks: sums the `starving` dimension over the
# last 5 minutes. WARNING is raised above 10 and held while the sum stays
# above 0.
# The `$this != nan` guard makes the condition false when the lookup is nan.
template: am_data_factory_airflow_ir_tasks_starving
on: azure_monitor.data_factory.airflow_ir_scheduler_tasks
component: Azure Data Factory
class: Workload
type: Other
every: 1m
lookup: sum -5m unaligned of starving
units: tasks
warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (10))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory Airflow scheduler starving tasks on ${label:resource_name}
info: Starving tasks reported by Airflow scheduler on Data Factory \
${label:resource_name} in ${label:resource_group} (${label:region}). \
Tasks cannot be scheduled due to resource constraints
# --- Airflow IR Task Instances ---
# Airflow task instance failures: sums the `failed` dimension over the last
# 5 minutes. WARNING is raised above 5 and held while the sum stays above 0.
# The `$this != nan` guard makes the condition false when the lookup is nan.
template: am_data_factory_airflow_ir_task_instance_failures
on: azure_monitor.data_factory.airflow_ir_task_instances
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of failed
units: instances
warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (5))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory Airflow task failures on ${label:resource_name}
info: Failed Airflow task instances on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Airflow IR Trigger Issues ---
# Airflow trigger issues: sums the `blocked_main_thread` and
# `celery_timeout_errors` dimensions over the last 5 minutes. WARNING is raised
# above 3 and held while the sum stays above 0.
# The `$this != nan` guard makes the condition false when the lookup is nan.
template: am_data_factory_airflow_ir_trigger_issues
on: azure_monitor.data_factory.airflow_ir_trigger_issues
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of blocked_main_thread,celery_timeout_errors
units: events
warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (3))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory Airflow trigger issues on ${label:resource_name}
info: Trigger issues (blocked main thread, Celery timeouts) on Airflow IR \
of Data Factory ${label:resource_name} in ${label:resource_group} (${label:region})
# --- Airflow IR Zombies ---
# Airflow zombie tasks: sums the `killed` dimension over the last 5 minutes.
# WARNING is raised above 3 and held while the sum stays above 0.
# The `$this != nan` guard makes the condition false when the lookup is nan.
template: am_data_factory_airflow_ir_zombies
on: azure_monitor.data_factory.airflow_ir_zombies
component: Azure Data Factory
class: Errors
type: Other
every: 1m
lookup: sum -5m unaligned of killed
units: tasks
warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (3))
delay: down 5m multiplier 1.5 max 1h
to: sysadmin
summary: Data Factory Airflow zombie tasks on ${label:resource_name}
info: Zombie tasks killed by Airflow on Data Factory ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Zombies occur when tasks are marked running but no process is executing them