# File: /usr/lib/netdata/conf.d/health.d/azure_monitor_cognitive_services.conf
# you can disable an alarm notification by setting the 'to' line to: silent
# --- Cognitive Services - Availability ---
# Watches the 5-minute average of the "availability" dimension, re-evaluated every minute.
# Thresholds use hysteresis through $status:
#   WARNING  raises below 99 and, once raised, holds until the value is back at 99.9 or more.
#   CRITICAL raises below 90 and, once raised, holds until the value is back at 99 or more.
 template: am_cognitive_services_availability
       on: azure_monitor.cognitive_services.availability
    class: Availability
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of availability
    units: percentage
    every: 1m
     warn: $this < (($status >= $WARNING) ? (99.9) : (99))
     crit: $this < (($status == $CRITICAL) ? (99) : (90))
    delay: down 5m multiplier 1.5 max 1h
  summary: Cognitive Services availability on ${label:resource_name}
     info: Success rate of Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region})
       to: sysadmin
# --- Cognitive Services - Errors ---
# Server-side errors: 5-minute average of the "server" dimension, re-evaluated every minute.
# Thresholds use hysteresis through $status:
#   WARNING  raises above 5 and, once raised, holds while the value stays above 1.
#   CRITICAL raises above 20 and, once raised, holds while the value stays above 5.
 template: am_cognitive_services_server_errors
       on: azure_monitor.cognitive_services.errors
    class: Errors
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of server
    units: errors/s
    every: 1m
     warn: $this > (($status >= $WARNING) ? (1) : (5))
     crit: $this > (($status == $CRITICAL) ? (5) : (20))
    delay: down 5m multiplier 1.5 max 1h
  summary: Cognitive Services server errors on ${label:resource_name}
     info: Rate of server errors (5xx) from Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region})
       to: sysadmin
# Client-side errors: 5-minute average of the "client" dimension, re-evaluated every minute.
# Warning-only alert (no crit line):
#   WARNING raises above 50 and, once raised, holds while the value stays above 10.
 template: am_cognitive_services_client_errors
       on: azure_monitor.cognitive_services.errors
    class: Errors
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of client
    units: errors/s
    every: 1m
     warn: $this > (($status >= $WARNING) ? (10) : (50))
    delay: down 5m multiplier 1.5 max 1h
  summary: Cognitive Services client errors on ${label:resource_name}
     info: Rate of client errors (4xx) from Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
High client error rates may indicate misconfigured API calls or invalid requests
       to: sysadmin
# --- Cognitive Services - Latency ---
# Watches the 5-minute average of the "average" dimension, re-evaluated every minute.
# Thresholds (in the configured units, milliseconds) use hysteresis through $status:
#   WARNING  raises above 5000 and, once raised, holds while the value stays above 3000.
#   CRITICAL raises above 10000 and, once raised, holds while the value stays above 5000.
 template: am_cognitive_services_latency
       on: azure_monitor.cognitive_services.latency
    class: Latency
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of average
    units: milliseconds
    every: 1m
     warn: $this > (($status >= $WARNING) ? (3000) : (5000))
     crit: $this > (($status == $CRITICAL) ? (5000) : (10000))
    delay: down 5m multiplier 1.5 max 1h
  summary: Cognitive Services latency on ${label:resource_name}
     info: Average API latency of Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region})
       to: sysadmin
# --- Cognitive Services - Rate Limiting ---
# Watches the 5-minute average of the "rate_limit" dimension, re-evaluated every minute.
# Thresholds use hysteresis through $status:
#   WARNING  raises above 5 and, once raised, holds while the value stays above 1.
#   CRITICAL raises above 20 and, once raised, holds while the value stays above 5.
 template: am_cognitive_services_rate_limit
       on: azure_monitor.cognitive_services.rate_limit
    class: Workload
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of rate_limit
    units: requests/s
    every: 1m
     warn: $this > (($status >= $WARNING) ? (1) : (5))
     crit: $this > (($status == $CRITICAL) ? (5) : (20))
    delay: down 5m multiplier 1.5 max 1h
  summary: Cognitive Services rate limiting on ${label:resource_name}
     info: Rate of throttled requests due to rate limiting on Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Indicates the service is hitting its API call quota
       to: sysadmin
# --- Cognitive Services - Blocked Calls ---
# Watches the 5-minute average of the "blocked" dimension, re-evaluated every minute.
# Thresholds use hysteresis through $status:
#   WARNING  raises above 5 and, once raised, holds while the value stays above 0.
#   CRITICAL raises above 20 and, once raised, holds while the value stays above 5.
 template: am_cognitive_services_blocked_calls
       on: azure_monitor.cognitive_services.calls
    class: Errors
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of blocked
    units: calls/s
    every: 1m
     warn: $this > (($status >= $WARNING) ? (0) : (5))
     crit: $this > (($status == $CRITICAL) ? (5) : (20))
    delay: down 5m multiplier 1.5 max 1h
  summary: Cognitive Services blocked calls on ${label:resource_name}
     info: Rate of blocked API calls on Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Blocked calls indicate authorization or policy violations
       to: sysadmin
# --- Azure OpenAI - Availability ---
# Watches the 5-minute average of the "availability" dimension, re-evaluated every minute.
# Both expressions require $this != nan, so they are never true when the lookup yields NaN.
# Thresholds use hysteresis through $status:
#   WARNING  raises below 99 and, once raised, holds until the value is back at 99.9 or more.
#   CRITICAL raises below 90 and, once raised, holds until the value is back at 99 or more.
 template: am_cognitive_services_openai_availability
       on: azure_monitor.cognitive_services.openai_availability
    class: Availability
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of availability
    units: percentage
    every: 1m
     warn: $this != nan AND $this < (($status >= $WARNING) ? (99.9) : (99))
     crit: $this != nan AND $this < (($status == $CRITICAL) ? (99) : (90))
    delay: down 5m multiplier 1.5 max 1h
  summary: Azure OpenAI availability on ${label:resource_name}
     info: Availability rate of Azure OpenAI service on ${label:resource_name} \
in ${label:resource_group} (${label:region})
       to: sysadmin
# --- Azure OpenAI - Latency ---
# Time to response: 5-minute average of the "time_to_response" dimension, re-evaluated every minute.
# Both expressions require $this != nan, so they are never true when the lookup yields NaN.
# Thresholds (in the configured units, milliseconds) use hysteresis through $status:
#   WARNING  raises above 10000 and, once raised, holds while the value stays above 5000.
#   CRITICAL raises above 30000 and, once raised, holds while the value stays above 10000.
 template: am_cognitive_services_openai_time_to_response
       on: azure_monitor.cognitive_services.openai_latency
    class: Latency
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of time_to_response
    units: milliseconds
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING) ? (5000) : (10000))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (10000) : (30000))
    delay: down 5m multiplier 1.5 max 1h
  summary: Azure OpenAI time to response on ${label:resource_name}
     info: Average time to response for Azure OpenAI requests on ${label:resource_name} \
in ${label:resource_group} (${label:region})
       to: sysadmin
# Time to first token: 5-minute average of the "time_to_first_token" dimension, re-evaluated every minute.
# Both expressions require $this != nan, so they are never true when the lookup yields NaN.
# Thresholds (in the configured units, milliseconds) use hysteresis through $status:
#   WARNING  raises above 5000 and, once raised, holds while the value stays above 3000.
#   CRITICAL raises above 10000 and, once raised, holds while the value stays above 5000.
 template: am_cognitive_services_openai_time_to_first_token
       on: azure_monitor.cognitive_services.openai_latency
    class: Latency
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of time_to_first_token
    units: milliseconds
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING) ? (3000) : (5000))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (5000) : (10000))
    delay: down 5m multiplier 1.5 max 1h
  summary: Azure OpenAI time to first token on ${label:resource_name}
     info: Average normalized time to first token for Azure OpenAI requests on ${label:resource_name} \
in ${label:resource_group} (${label:region})
       to: sysadmin
# --- Azure OpenAI - Provisioned Utilization ---
# Watches the 5-minute average of the "utilization" dimension, re-evaluated every minute.
# Both expressions require $this != nan, so they are never true when the lookup yields NaN.
# Thresholds use hysteresis through $status:
#   WARNING  raises above 80 and, once raised, holds while the value stays above 70.
#   CRITICAL raises above 90 and, once raised, holds while the value stays above 80.
 template: am_cognitive_services_openai_provisioned_utilization
       on: azure_monitor.cognitive_services.openai_provisioned_utilization
    class: Utilization
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of utilization
    units: percentage
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING) ? (70) : (80))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (80) : (90))
    delay: down 5m multiplier 1.5 max 1h
  summary: Azure OpenAI provisioned utilization on ${label:resource_name}
     info: Provisioned-managed utilization of Azure OpenAI deployment on ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
High utilization means requests may be throttled or rejected
       to: sysadmin
# --- Models - Availability ---
# Watches the 5-minute average of the "availability" dimension, re-evaluated every minute.
# Both expressions require $this != nan, so they are never true when the lookup yields NaN.
# Thresholds use hysteresis through $status:
#   WARNING  raises below 99 and, once raised, holds until the value is back at 99.9 or more.
#   CRITICAL raises below 90 and, once raised, holds until the value is back at 99 or more.
 template: am_cognitive_services_model_availability
       on: azure_monitor.cognitive_services.model_availability
    class: Availability
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of availability
    units: percentage
    every: 1m
     warn: $this != nan AND $this < (($status >= $WARNING) ? (99.9) : (99))
     crit: $this != nan AND $this < (($status == $CRITICAL) ? (99) : (90))
    delay: down 5m multiplier 1.5 max 1h
  summary: Model availability on ${label:resource_name}
     info: Model availability rate on Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region})
       to: sysadmin
# --- Models - Latency ---
# Time to response: 5-minute average of the "time_to_response" dimension, re-evaluated every minute.
# Both expressions require $this != nan, so they are never true when the lookup yields NaN.
# Thresholds (in the configured units, milliseconds) use hysteresis through $status:
#   WARNING  raises above 10000 and, once raised, holds while the value stays above 5000.
#   CRITICAL raises above 30000 and, once raised, holds while the value stays above 10000.
 template: am_cognitive_services_model_time_to_response
       on: azure_monitor.cognitive_services.model_latency
    class: Latency
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of time_to_response
    units: milliseconds
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING) ? (5000) : (10000))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (10000) : (30000))
    delay: down 5m multiplier 1.5 max 1h
  summary: Model time to response on ${label:resource_name}
     info: Average time to response for model requests on Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region})
       to: sysadmin
# Time to first token: 5-minute average of the "time_to_first_token" dimension, re-evaluated every minute.
# Both expressions require $this != nan, so they are never true when the lookup yields NaN.
# Thresholds (in the configured units, milliseconds) use hysteresis through $status:
#   WARNING  raises above 5000 and, once raised, holds while the value stays above 3000.
#   CRITICAL raises above 10000 and, once raised, holds while the value stays above 5000.
 template: am_cognitive_services_model_time_to_first_token
       on: azure_monitor.cognitive_services.model_latency
    class: Latency
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of time_to_first_token
    units: milliseconds
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING) ? (3000) : (5000))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (5000) : (10000))
    delay: down 5m multiplier 1.5 max 1h
  summary: Model time to first token on ${label:resource_name}
     info: Average normalized time to first token for model requests on Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region})
       to: sysadmin
# --- Models - Provisioned Utilization ---
# Watches the 5-minute average of the "utilization" dimension, re-evaluated every minute.
# Both expressions require $this != nan, so they are never true when the lookup yields NaN.
# Thresholds use hysteresis through $status:
#   WARNING  raises above 80 and, once raised, holds while the value stays above 70.
#   CRITICAL raises above 90 and, once raised, holds while the value stays above 80.
 template: am_cognitive_services_model_provisioned_utilization
       on: azure_monitor.cognitive_services.model_provisioned_utilization
    class: Utilization
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of utilization
    units: percentage
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING) ? (70) : (80))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (80) : (90))
    delay: down 5m multiplier 1.5 max 1h
  summary: Model provisioned utilization on ${label:resource_name}
     info: Provisioned utilization of model deployment on Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
High utilization means requests may be throttled or rejected
       to: sysadmin
# --- Content Safety - Harmful Requests ---
# Harmful requests: 5-minute average of the "harmful" dimension, re-evaluated every minute.
# Both expressions require $this != nan, so they are never true when the lookup yields NaN.
# Thresholds use hysteresis through $status:
#   WARNING  raises above 5 and, once raised, holds while the value stays above 1.
#   CRITICAL raises above 20 and, once raised, holds while the value stays above 5.
 template: am_cognitive_services_harmful_requests
       on: azure_monitor.cognitive_services.content_safety_requests
    class: Errors
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of harmful
    units: requests/s
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING) ? (1) : (5))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (5) : (20))
    delay: down 5m multiplier 1.5 max 1h
  summary: Harmful content requests on ${label:resource_name}
     info: Rate of requests flagged as harmful by content safety on Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
High rates may indicate abuse or prompt injection attempts
       to: sysadmin
# Blocked content requests: 5-minute average of the "blocked" dimension, re-evaluated every minute.
# The expression requires $this != nan, so it is never true when the lookup yields NaN.
# Warning-only alert (no crit line):
#   WARNING raises above 10 and, once raised, holds while the value stays above 1.
 template: am_cognitive_services_blocked_content_requests
       on: azure_monitor.cognitive_services.content_safety_requests
    class: Errors
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of blocked
    units: requests/s
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING) ? (1) : (10))
    delay: down 5m multiplier 1.5 max 1h
  summary: Blocked content requests on ${label:resource_name}
     info: Rate of requests rejected by content safety on Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region})
       to: sysadmin
# --- Content Safety - Abusive Users ---
# Watches the 5-minute average of the "abusive_users" dimension, re-evaluated every minute.
# The expression requires $this != nan, so it is never true when the lookup yields NaN.
# Warning-only alert (no crit line):
#   WARNING raises above 1 and, once raised, holds while the value stays above 0.
 template: am_cognitive_services_abusive_users
       on: azure_monitor.cognitive_services.content_safety_abusive_users
    class: Errors
     type: Other
component: Cognitive Services
   lookup: average -5m unaligned of abusive_users
    units: users/s
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (1))
    delay: down 5m multiplier 1.5 max 1h
  summary: Abusive users detected on ${label:resource_name}
     info: Rate of potentially abusive users detected on Cognitive Services resource ${label:resource_name} \
in ${label:resource_group} (${label:region})
       to: sysadmin