diff --git a/group_vars/all/plain.yml b/group_vars/all/plain.yml index 7d9f174..566f1b4 100644 --- a/group_vars/all/plain.yml +++ b/group_vars/all/plain.yml @@ -153,6 +153,7 @@ connect_admin_email: '{{ devops_email_address }}' keycloak_admin_email: '{{ devops_email_address }}' pgadmin4_admin_email: '{{ devops_email_address }}' harbor_oidc_admin_email: '{{ devops_email_address }}' +grafana_admin_email: '{{ devops_email_address }}' http_port: "80" https_port: "443" diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 1b49d8d..96b9716 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -7,20 +7,20 @@ prometheus_datadir: "{{ prometheus_lvm_hcloudvol_mountpath }}" # https://github.com/prometheus/prometheus prometheus_image_name: "prom/prometheus" -prometheus_image_version: "v2.33.4" +prometheus_image_version: "v2.38.0" # https://github.com/grafana/grafana grafana_image_name: "grafana/grafana" -grafana_image_version: "8.3.5" +grafana_image_version: "9.1.5" # https://github.com/prometheus/alertmanager alertmanager_image_name: "prom/alertmanager" -alertmanager_image_version: "v0.23.0" +alertmanager_image_version: "v0.24.0" # https://github.com/prometheus/blackbox_exporter blackbox_exporter_image_name: "prom/blackbox-exporter" -blackbox_exporter_image_version: "v0.19.0" +blackbox_exporter_image_version: "v0.22.0" # https://github.com/idealista/prom2teams prom2teams_image_name: "idealista/prom2teams" -prom2teams_image_version: "3.2.3" +prom2teams_image_version: "4.0.0" diff --git a/templates/prometheus/config/grafana/conf/defaults.ini.j2 b/templates/prometheus/config/grafana/conf/defaults.ini.j2 index 5b6c965..6162d24 100644 --- a/templates/prometheus/config/grafana/conf/defaults.ini.j2 +++ b/templates/prometheus/config/grafana/conf/defaults.ini.j2 @@ -9,6 +9,9 @@ app_mode = production # instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty 
instance_name = ${HOSTNAME} +# force migration will run migrations that might cause dataloss +force_migration = false + #################################### Paths ############################### [paths] # Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used) @@ -66,6 +69,13 @@ cert_key = # Unix socket path socket = /tmp/grafana.sock +# CDN Url +cdn_url = + +# Sets the maximum time in minutes before timing out read of an incoming request and closing idle connections. +# `0` means there is no timeout for reading the request. +read_timeout = 0 + #################################### Database ############################ [database] # You can configure the database connection by specifying type, host, name, user and password @@ -98,6 +108,12 @@ log_queries = # For "mysql", use either "true", "false", or "skip-verify". ssl_mode = disable +# Database drivers may support different transaction isolation levels. +# Currently, only "mysql" driver supports isolation levels. +# If the value is empty - driver's default isolation level is applied. +# For "mysql" use "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ" or "SERIALIZABLE". +isolation_level = + ca_cert_path = client_key_path = client_cert_path = @@ -109,6 +125,9 @@ path = grafana.db # For "sqlite3" only. cache mode setting used for connecting to the database cache_mode = private +# For "mysql" only if migrationLocking feature toggle is set. How many seconds to wait before failing to lock the database for the migrations, default is 0. +locking_attempt_timeout_sec = 0 + #################################### Cache server ############################# [remote_cache] # Either "redis", "memcached" or "database" default is "database" @@ -126,13 +145,45 @@ connstr = # This enables data proxy logging, default is false logging = false -# How long the data proxy waits before timing out, default is 30 seconds. 
+# How long the data proxy waits to read the headers of the response before timing out, default is 30 seconds. # This setting also applies to core backend HTTP data sources where query requests use an HTTP client with timeout set. timeout = 30 -# If enabled and user is not anonymous, data proxy will add X-Grafana-User header with username into the request, default is false. +# How long the data proxy waits to establish a TCP connection before timing out, default is 10 seconds. +dialTimeout = 10 + +# How many seconds the data proxy waits before sending a keepalive request. +keep_alive_seconds = 30 + +# How many seconds the data proxy waits for a successful TLS Handshake before timing out. +tls_handshake_timeout_seconds = 10 + +# How many seconds the data proxy will wait for a server's first response headers after +# fully writing the request headers if the request has an "Expect: 100-continue" +# header. A value of 0 will result in the body being sent immediately, without +# waiting for the server to approve. +expect_continue_timeout_seconds = 1 + +# Optionally limits the total number of connections per host, including connections in the dialing, +# active, and idle states. On limit violation, dials will block. +# A value of zero (0) means no limit. +max_conns_per_host = 0 + +# The maximum number of idle connections that Grafana will keep alive. +max_idle_connections = 100 + +# How many seconds the data proxy keeps an idle connection open before timing out. +idle_conn_timeout_seconds = 90 + +# If enabled and user is not anonymous, data proxy will add X-Grafana-User header with username into the request. send_user_header = false +# Limit the amount of bytes that will be read/accepted from responses of outgoing HTTP requests. +response_limit = 0 + +# Limits the number of rows that Grafana will process from SQL data sources. 
+row_limit = 1000000 + #################################### Analytics ########################### [analytics] # Server reporting, sends usage counters to stats.grafana.org every 24 hours. @@ -141,19 +192,50 @@ send_user_header = false # Change this option to false to disable reporting. reporting_enabled = true +# The name of the distributor of the Grafana instance. Ex hosted-grafana, grafana-labs +reporting_distributor = grafana-labs + # Set to false to disable all checks to https://grafana.com -# for new versions (grafana itself and plugins), check is used -# in some UI views to notify that grafana or plugin update exists +# for new versions of grafana. The check is used +# in some UI views to notify that a grafana update exists. # This option does not cause any auto updates, nor send any information -# only a GET request to https://grafana.com to get latest versions +# only a GET request to https://raw.githubusercontent.com/grafana/grafana/main/latest.json to get the latest version. check_for_updates = true +# Set to false to disable all checks to https://grafana.com +# for new versions of plugins. The check is used +# in some UI views to notify that a plugin update exists. +# This option does not cause any auto updates, nor send any information +# only a GET request to https://grafana.com to get the latest versions. 
+check_for_plugin_updates = true + # Google Analytics universal tracking code, only enabled if you specify an id here google_analytics_ua_id = # Google Tag Manager ID, only enabled if you specify an id here google_tag_manager_id = +# Rudderstack write key, enabled only if rudderstack_data_plane_url is also set +rudderstack_write_key = + +# Rudderstack data plane url, enabled only if rudderstack_write_key is also set +rudderstack_data_plane_url = + +# Rudderstack SDK url, optional, only valid if rudderstack_write_key and rudderstack_data_plane_url are also set +rudderstack_sdk_url = + +# Rudderstack Config url, optional, used by Rudderstack SDK to fetch source config +rudderstack_config_url = + +# Application Insights connection string. Specify a URL string to enable this feature. +application_insights_connection_string = + +# Optional. Specifies an Application Insights endpoint URL where the endpoint string is wrapped in backticks ``. +application_insights_endpoint_url = + +# Controls if the UI contains any links to user feedback forms +feedback_links_enabled = true + #################################### Security ############################ [security] # disable creation of admin user on first start of grafana @@ -165,9 +247,18 @@ admin_user = {{ grafana_admin_username }} # default admin password, can be changed before first start of grafana, or in profile settings admin_password = {{ grafana_admin_password }} +# default admin email, created on startup +admin_email = {{ grafana_admin_email }} + # used for signing secret_key = {{ grafana_signing_secret }} +# current key provider used for envelope encryption, default to static value specified by secret_key +encryption_provider = secretKey.v1 + +# list of configured key providers, space separated (Enterprise only): e.g., awskms.v1 azurekv.v1 +available_encryption_providers = + # disable gravatar profile images disable_gravatar = false @@ -187,9 +278,7 @@ cookie_samesite = lax allow_embedding = false # Set to true if 
you want to enable http strict transport security (HSTS) response header. -# This is only sent when HTTPS is enabled in this configuration. # HSTS tells browsers that the site should only be accessed using HTTPS. -# The default will change to true in the next minor release, 6.3. strict_transport_security = false # Sets how long a browser should cache HSTS. Only applied if strict_transport_security is enabled. @@ -203,20 +292,40 @@ strict_transport_security_subdomains = false # Set to true to enable the X-Content-Type-Options response header. # The X-Content-Type-Options response HTTP header is a marker used by the server to indicate that the MIME types advertised -# in the Content-Type headers should not be changed and be followed. The default will change to true in the next minor release, 6.3. -x_content_type_options = false +# in the Content-Type headers should not be changed and be followed. +x_content_type_options = true # Set to true to enable the X-XSS-Protection header, which tells browsers to stop pages from loading -# when they detect reflected cross-site scripting (XSS) attacks. The default will change to true in the next minor release, 6.3. -x_xss_protection = false +# when they detect reflected cross-site scripting (XSS) attacks. +x_xss_protection = true + +# Enable adding the Content-Security-Policy header to your requests. +# CSP allows to control resources the user agent is allowed to load and helps prevent XSS attacks. +content_security_policy = false + +# Set Content Security Policy template used when adding the Content-Security-Policy header to your requests. +# $NONCE in the template includes a random nonce. +# $ROOT_PATH is server.root_url without the protocol. 
+content_security_policy_template = """script-src 'self' 'unsafe-eval' 'unsafe-inline' 'strict-dynamic' $NONCE;object-src 'none';font-src 'self';style-src 'self' 'unsafe-inline' blob:;img-src * data:;base-uri 'self';connect-src 'self' grafana.com ws://$ROOT_PATH wss://$ROOT_PATH;manifest-src 'self';media-src 'none';form-action 'self';""" + +# Controls if old angular plugins are supported or not. This will be disabled by default in a future release +angular_support_enabled = true + +[security.encryption] +# Defines the time-to-live (TTL) for decrypted data encryption keys stored in memory (cache). +# Please note that small values may cause performance issues due to a high frequency of decryption operations. +data_keys_cache_ttl = 15m +# Defines the frequency of data encryption keys cache cleanup interval. +# On every interval, decrypted data encryption keys that reached the TTL are removed from the cache. +data_keys_cache_cleanup_interval = 1m #################################### Snapshots ########################### [snapshots] # snapshot sharing options external_enabled = true -external_snapshot_url = https://snapshots-origin.raintank.io -external_snapshot_name = Publish to snapshot.raintank.io +external_snapshot_url = https://snapshots.raintank.io +external_snapshot_name = Publish to snapshots.raintank.io # Set to true to enable this Grafana instance act as an external snapshot server and allow unauthenticated requests for # creating and deleting snapshots. @@ -238,6 +347,11 @@ min_refresh_interval = 5s # Path to the default home dashboard. If this value is empty, then Grafana uses StaticRootPath + "dashboards/home.json" default_home_dashboard_path = +################################### Data sources ######################### +[datasources] +# Upper limit of data sources that Grafana will return. This limit is a temporary configuration and it will be deprecated when pagination is introduced on the list data sources API. 
+datasource_limit = 5000 + #################################### Users ############################### [users] # disable user signup / registration @@ -265,6 +379,12 @@ password_hint = password # Default UI theme ("dark" or "light") default_theme = dark +# Default locale (supported IETF language tag, such as en-US) +default_locale = en-US + +# Path to a custom home page. Users are only redirected to this if the default home dashboard is used. It should match a frontend route and contain a leading slash. +home_page = + # External user management external_manage_link_url = external_manage_link_name = @@ -276,15 +396,21 @@ viewers_can_edit = false # Editors can administrate dashboard, folders and teams they create editors_can_admin = false +# The duration in time a user invitation remains valid before expiring. This setting should be expressed as a duration. Examples: 6h (hours), 2d (days), 1w (week). Default is 24h (24 hours). The minimum supported duration is 15m (15 minutes). +user_invite_max_lifetime_duration = 24h + +# Enter a comma-separated list of usernames to hide them in the Grafana UI. These users are shown to Grafana admins and to themselves. +hidden_users = + [auth] # Login cookie name login_cookie_name = grafana_session -# The lifetime (days) an authenticated user can be inactive before being required to login at next visit. Default is 7 days. -login_maximum_inactive_lifetime_days = 7 +# The maximum lifetime (duration) an authenticated user can be inactive before being required to login at next visit. Default is 7 days (7d). This setting should be expressed as a duration, e.g. 5m (minutes), 6h (hours), 10d (days), 2w (weeks), 1M (month). The lifetime resets at each successful token rotation (token_rotation_interval_minutes). +login_maximum_inactive_lifetime_duration = -# The maximum lifetime (days) an authenticated user can be logged in since login time before being required to login. Default is 30 days. 
-login_maximum_lifetime_days = 30 +# The maximum lifetime (duration) an authenticated user can be logged in since login time before being required to login. Default is 30 days (30d). This setting should be expressed as a duration, e.g. 5m (minutes), 6h (hours), 10d (days), 2w (weeks), 1M (month). +login_maximum_lifetime_duration = # How often should auth tokens be rotated for authenticated users when being active. The default is each 10 minutes. token_rotation_interval_minutes = 10 @@ -292,7 +418,7 @@ token_rotation_interval_minutes = 10 # Set to true to disable (hide) the login form, useful if you use OAuth disable_login_form = false -# Set to true to disable the signout link in the side menu. useful if you use auth.proxy +# Set to true to disable the sign out link in the side menu. Useful if you use auth.proxy or auth.jwt. disable_signout_menu = false # URL to redirect the user to after sign out @@ -302,12 +428,24 @@ signout_redirect_url = # This setting is ignored if multiple OAuth providers are configured. oauth_auto_login = false -# OAuth state max age cookie duration. Defaults to 60 seconds. -oauth_state_cookie_max_age = 60 +# OAuth state max age cookie duration in seconds. Defaults to 600 seconds. +oauth_state_cookie_max_age = 600 + +# Skip forced assignment of OrgID 1 or 'auto_assign_org_id' for social logins +oauth_skip_org_role_update_sync = false # limit of api_key seconds to live before expiration api_key_max_seconds_to_live = -1 +# Set to true to enable SigV4 authentication option for HTTP-based datasources +sigv4_auth_enabled = false + +# Set to true to enable verbose logging of SigV4 request signing +sigv4_verbose_logging = false + +# Set to true to enable Azure authentication option for HTTP-based datasources +azure_auth_enabled = false + #################################### Anonymous Auth ###################### [auth.anonymous] # enable anonymous access @@ -319,12 +457,15 @@ org_name = Main Org. 
# specify role for unauthenticated users org_role = Viewer -#################################### Github Auth ######################### +# mask the Grafana version number for unauthenticated users +hide_version = false + +#################################### GitHub Auth ######################### [auth.github] enabled = false allow_sign_up = true client_id = some_id -client_secret = some_secret +client_secret = scopes = user:email,read:org auth_url = https://github.com/login/oauth/authorize token_url = https://github.com/login/oauth/access_token @@ -332,26 +473,32 @@ api_url = https://api.github.com/user allowed_domains = team_ids = allowed_organizations = +role_attribute_path = +role_attribute_strict = false +allow_assign_grafana_admin = false #################################### GitLab Auth ######################### [auth.gitlab] enabled = false allow_sign_up = true client_id = some_id -client_secret = some_secret +client_secret = scopes = api auth_url = https://gitlab.com/oauth/authorize token_url = https://gitlab.com/oauth/token api_url = https://gitlab.com/api/v4 allowed_domains = allowed_groups = +role_attribute_path = +role_attribute_strict = false +allow_assign_grafana_admin = false #################################### Google Auth ######################### [auth.google] enabled = false allow_sign_up = true client_id = some_client_id -client_secret = some_client_secret +client_secret = scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email auth_url = https://accounts.google.com/o/oauth2/auth token_url = https://accounts.google.com/o/oauth2/token @@ -365,7 +512,7 @@ hosted_domain = enabled = false allow_sign_up = true client_id = some_id -client_secret = some_secret +client_secret = scopes = user:email allowed_organizations = @@ -373,7 +520,7 @@ allowed_organizations = enabled = false allow_sign_up = true client_id = some_id -client_secret = some_secret +client_secret = scopes = user:email allowed_organizations 
= @@ -383,20 +530,23 @@ name = Azure AD enabled = false allow_sign_up = true client_id = some_client_id -client_secret = some_client_secret +client_secret = scopes = openid email profile auth_url = https://login.microsoftonline.com//oauth2/v2.0/authorize token_url = https://login.microsoftonline.com//oauth2/v2.0/token allowed_domains = allowed_groups = +role_attribute_strict = false +allow_assign_grafana_admin = false #################################### Okta OAuth ####################### [auth.okta] name = Okta +icon = okta enabled = false allow_sign_up = true client_id = some_id -client_secret = some_secret +client_secret = scopes = openid profile email groups auth_url = https://.okta.com/oauth2/v1/authorize token_url = https://.okta.com/oauth2/v1/token @@ -404,21 +554,32 @@ api_url = https://.okta.com/oauth2/v1/userinfo allowed_domains = allowed_groups = role_attribute_path = +role_attribute_strict = false +allow_assign_grafana_admin = false #################################### Generic OAuth ####################### [auth.generic_oauth] name = OAuth +icon = signin enabled = false allow_sign_up = true client_id = some_id -client_secret = some_secret +client_secret = scopes = user:email +empty_scopes = false email_attribute_name = email:primary email_attribute_path = +login_attribute_path = +name_attribute_path = role_attribute_path = +role_attribute_strict = false +groups_attribute_path = +id_token_attribute_name = +team_ids_attribute_path = auth_url = token_url = api_url = +teams_url = allowed_domains = team_ids = allowed_organizations = @@ -426,6 +587,9 @@ tls_skip_verify_insecure = false tls_client_cert = tls_client_key = tls_client_ca = +use_pkce = false +auth_style = +allow_assign_grafana_admin = false #################################### Basic Auth ########################## [auth.basic] @@ -437,12 +601,28 @@ enabled = false header_name = X-WEBAUTH-USER header_property = username auto_sign_up = true -# Deprecated, use sync_ttl instead -ldap_sync_ttl = 60 
sync_ttl = 60 whitelist = headers = +headers_encoded = false +enable_login_token = false + +#################################### Auth JWT ########################## +[auth.jwt] +enabled = false enable_login_token = false +header_name = +email_claim = +username_claim = +jwk_set_url = +jwk_set_file = +cache_ttl = 60m +expected_claims = {} +key_file = +role_attribute_path = +role_attribute_strict = false +auto_sign_up = false +allow_assign_grafana_admin = false #################################### Auth LDAP ########################### [auth.ldap] @@ -450,28 +630,64 @@ enabled = false config_file = /etc/grafana/ldap.toml allow_sign_up = true -# LDAP backround sync (Enterprise only) +# LDAP background sync (Enterprise only) # At 1 am every day -sync_cron = "0 0 1 * * *" +sync_cron = "0 1 * * *" active_sync_enabled = true +#################################### AWS ########################### +[aws] +# Enter a comma-separated list of allowed AWS authentication providers. +# Options are: default (AWS SDK Default), keys (Access && secret key), credentials (Credentials field), ec2_iam_role (EC2 IAM Role) +allowed_auth_providers = default,keys,credentials + +# Allow AWS users to assume a role using temporary security credentials. +# If true, assume role will be enabled for all AWS authentication providers that are specified in aws_auth_providers +assume_role_enabled = true + +# Specify max no of pages to be returned by the ListMetricPages API +list_metrics_page_limit = 500 + +#################################### Azure ############################### +[azure] +# Azure cloud environment where Grafana is hosted +# Possible values are AzureCloud, AzureChinaCloud, AzureUSGovernment and AzureGermanCloud +# Default value is AzureCloud (i.e. public cloud) +cloud = AzureCloud + +# Specifies whether Grafana hosted in Azure service with Managed Identity configured (e.g. 
Azure Virtual Machines instance) +# If enabled, the managed identity can be used for authentication of Grafana in Azure services +# Disabled by default, needs to be explicitly enabled +managed_identity_enabled = false + +# Client ID to use for user-assigned managed identity +# Should be set for user-assigned identity and should be empty for system-assigned identity +managed_identity_client_id = + +#################################### Role-based Access Control ########### +[rbac] +# If enabled, cache permissions in an in-memory cache +permission_cache = true + #################################### SMTP / Emailing ##################### [smtp] enabled = true -host = smtp.tolina.local:25 +host = {{ shared_service_mail_hostname }}:25 user = # If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;""" password = cert_file = key_file = skip_verify = true -from_address = admin@grafana-qa.smardigo.digital -from_name = smardigo +from_address = {{ grafana_admin_email }} +from_name = Grafana ehlo_identity = +startTLS_policy = [emails] welcome_email_on_sign_up = false -templates_pattern = emails/*.html +templates_pattern = emails/*.html, emails/*.txt +content_types = text/html #################################### Logging ########################## [log] @@ -530,6 +746,40 @@ facility = # Syslog tag. By default, the process' argv[0] is used. tag = +[log.frontend] +# Should Sentry javascript agent be initialized +enabled = false + +# Defines which provider to use: sentry or grafana +provider = sentry + +# Sentry DSN if you want to send events to Sentry. +sentry_dsn = + +# Custom HTTP endpoint to send events to. Default will log the events to stdout. +custom_endpoint = + +# Rate of events to be reported to Sentry between 0 (none) and 1 (all), float +sample_rate = 1.0 + +# Requests per second limit enforced per an extended period, for Grafana backend log ingestion endpoint (/log). 
+log_endpoint_requests_per_second_limit = 3 + +# Max requests accepted per short interval of time for Grafana backend log ingestion endpoint (/log) +log_endpoint_burst_limit = 15 + +# Should error instrumentation be enabled, only affects Grafana Javascript Agent +instrumentations_errors_enabled = true + +# Should console instrumentation be enabled, only affects Grafana Javascript Agent +instrumentations_console_enabled = false + +# Should webvitals instrumentation be enabled, only affects Grafana Javascript Agent +instrumentations_webvitals_enabled = false + +# Api Key, only applies to Grafana Javascript Agent provider +api_key = + #################################### Usage Quotas ######################## [quota] enabled = false @@ -547,6 +797,9 @@ org_data_source = 10 # limit number of api_keys per Org. org_api_key = 10 +# limit number of alerts per Org. +org_alert_rule = 100 + # limit number of orgs a user can create. user_org = 10 @@ -565,11 +818,94 @@ global_api_key = -1 # global limit on number of logged in users. global_session = -1 +# global limit of alerts +global_alert_rule = -1 + +# global limit of files uploaded to the SQL DB +global_file = 1000 + +#################################### Unified Alerting #################### +[unified_alerting] +# Enable the Unified Alerting sub-system and interface. When enabled we'll migrate all of your alert rules and notification channels to the new system. New alert rules will be created and your notification channels will be converted into an Alertmanager configuration. Previous data is preserved to enable backwards compatibility but new data is removed when switching. When this configuration section and flag are not defined, the state is defined at runtime. See the documentation for more details. +enabled = + +# Comma-separated list of organization IDs for which to disable unified alerting. Only supported if unified alerting is enabled. +disabled_orgs = + +# Specify the frequency of polling for admin config changes. 
+# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +admin_config_poll_interval = 60s + +# Specify the frequency of polling for Alertmanager config changes. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +alertmanager_config_poll_interval = 60s + +# Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port. +ha_listen_address = "0.0.0.0:9094" + +# Explicit address/hostname and port to advertise other Grafana instances. The port is used for both TCP and UDP. +ha_advertise_address = "" + +# Comma-separated list of initial instances (in a format of host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting. +ha_peers = "" + +# Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will +# be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long should +# each instance wait before sending the notification to take into account replication lag. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +ha_peer_timeout = 15s + +# The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated +# across cluster more quickly at the expense of increased bandwidth usage. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +ha_gossip_interval = 200ms + +# The interval between gossip full state syncs. 
Setting this interval lower (more frequent) will increase convergence speeds +# across larger clusters at the expense of increased bandwidth usage. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +ha_push_pull_interval = 60s + +# Enable or disable alerting rule execution. The alerting UI remains visible. This option has a legacy version in the `[alerting]` section that takes precedence. +execute_alerts = true + +# Alert evaluation timeout when fetching data from the datasource. This option has a legacy version in the `[alerting]` section that takes precedence. +# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +evaluation_timeout = 30s + +# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. This option has a legacy version in the `[alerting]` section that takes precedence. +max_attempts = 3 + +# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +min_interval = 10s + +[unified_alerting.screenshots] +# Enable screenshots in notifications. This option requires the Grafana Image Renderer plugin. +# For more information on configuration options, refer to [rendering]. +capture = false + +# The maximum number of screenshots that can be taken at the same time. 
This option is different from +# concurrent_render_request_limit as max_concurrent_screenshots sets the number of concurrent screenshots +# that can be taken at the same time for all firing alerts whereas concurrent_render_request_limit sets +# the total number of concurrent screenshots across all Grafana services. +max_concurrent_screenshots = 5 + +# Uploads screenshots to the local Grafana server or remote storage such as Azure, S3 and GCS. Please +# see [external_image_storage] for further configuration options. If this option is false then +# screenshots will be persisted to disk for up to temp_data_lifetime. +upload_external_image_storage = false + +[unified_alerting.reserved_labels] +# Comma-separated list of reserved labels added by the Grafana Alerting engine that should be disabled. +# For example: `disabled_labels=grafana_folder` +disabled_labels = + #################################### Alerting ############################ [alerting] -# Disable alerting engine & UI features +# Enable the legacy alerting sub-system and interface. If Unified Alerting is already enabled and you try to go back to legacy alerting, all data that is part of Unified Alerting will be deleted. When this configuration section and flag are not defined, the state is defined at runtime. See the documentation for more details. enabled = true -# Makes it possible to turn off alert rule execution but alerting UI is visible + +# Makes it possible to turn off alert execution but alerting UI is visible execute_alerts = true # Default setting for new alert rules. Defaults to categorize error and timeouts as alerting. (alerting, keep_state) @@ -594,23 +930,77 @@ max_attempts = 3 # Makes it possible to enforce a minimal interval between evaluations, to reduce load on the backend min_interval_seconds = 1 +# Configures how long alert annotations are stored. Default is 0, which keeps them forever. +# This setting should be expressed as a duration. 
Ex 6h (hours), 10d (days), 2w (weeks), 1M (month). +max_annotation_age = + +# Configures max number of alert annotations that Grafana stores. Default value is 0, which keeps all alert annotations. +max_annotations_to_keep = + +#################################### Annotations ######################### +[annotations] +# Configures the batch size for the annotation clean-up job. This setting is used for dashboard, API, and alert annotations. +cleanupjob_batchsize = 100 + +[annotations.dashboard] +# Dashboard annotations means that annotations are associated with the dashboard they are created on. + +# Configures how long dashboard annotations are stored. Default is 0, which keeps them forever. +# This setting should be expressed as a duration. Examples: 6h (hours), 10d (days), 2w (weeks), 1M (month). +max_age = + +# Configures max number of dashboard annotations that Grafana stores. Default value is 0, which keeps all dashboard annotations. +max_annotations_to_keep = + +[annotations.api] +# API annotations means that the annotations have been created using the API without any +# association with a dashboard. + +# Configures how long Grafana stores API annotations. Default is 0, which keeps them forever. +# This setting should be expressed as a duration. Examples: 6h (hours), 10d (days), 2w (weeks), 1M (month). +max_age = + +# Configures max number of API annotations that Grafana keeps. Default value is 0, which keeps all API annotations. 
+max_annotations_to_keep = + #################################### Explore ############################# [explore] # Enable the Explore section enabled = true +#################################### Help ############################# +[help] +# Enable the Help section +enabled = true + +#################################### Profile ############################# +[profile] +# Enable the Profile section +enabled = true + +#################################### Query History ############################# +[query_history] +# Enable the Query history +enabled = true + #################################### Internal Grafana Metrics ############ -# Metrics available at HTTP API Url /metrics +# Metrics available at HTTP URL /metrics and /metrics/plugins/:pluginId [metrics] enabled = true interval_seconds = 10 # Disable total stats (stat_totals_*) metrics to be generated disable_total_stats = false -#If both are set, basic auth will be required for the metrics endpoint. +#If both are set, basic auth will be required for the metrics endpoints. basic_auth_username = basic_auth_password = +# Metrics environment info adds dimensions to the `grafana_environment_info` metric, which +# can expose more information about the Grafana instance. +[metrics.environment_info] +#exampleLabel1 = exampleValue1 +#exampleLabel2 = exampleValue2 + # Send internal Grafana metrics to graphite [metrics.graphite] # Enable by setting the address setting (ex localhost:2003) @@ -625,6 +1015,7 @@ url = https://grafana.com url = https://grafana.com #################################### Distributed tracing ############ +# Opentracing is deprecated use opentelemetry instead [tracing.jaeger] # jaeger destination (ex localhost:6831) address = @@ -640,12 +1031,32 @@ sampler_type = const # and indicates the initial sampling rate before the actual one # is received from the mothership sampler_param = 1 +# sampling_server_url is the URL of a sampling manager providing a sampling strategy. 
+sampling_server_url = # Whether or not to use Zipkin span propagation (x-b3- HTTP headers). zipkin_propagation = false # Setting this to true disables shared RPC spans. # Not disabling is the most common setting when using Zipkin elsewhere in your infrastructure. disable_shared_zipkin_spans = false +[tracing.opentelemetry] + +# attributes that will always be included when creating new spans. ex (key1:value1,key2:value2) +custom_attributes = + +[tracing.opentelemetry.jaeger] +# jaeger destination (ex http://localhost:14268/api/traces) +address = +# Propagation specifies the text map propagation format: w3c, jaeger +propagation = + +# This is a configuration for OTLP exporter with GRPC protocol +[tracing.opentelemetry.otlp] +# otlp destination (ex localhost:4317) +address = +# Propagation specifies the text map propagation format: w3c, jaeger +propagation = + #################################### External Image Storage ############## [external_image_storage] # Used for uploading images to public servers so they can be included in slack/email messages. @@ -672,6 +1083,8 @@ public_url = key_file = bucket = path = +enable_signed_urls = false +signed_url_expiration = [external_image_storage.azure_blob] account_name = @@ -687,6 +1100,8 @@ container_name = server_url = # If the remote HTTP image renderer service runs on a different server than the Grafana server you may have to configure this to a URL where Grafana is reachable, e.g. http://grafana.domain/. callback_url = +# An auth token that will be sent to and verified by the renderer. The renderer will deny any request without an auth token matching the one configured on the renderer side. +renderer_token = - # Concurrent render request limit affects when the /render HTTP endpoint is used. Rendering many images at the same time can overload the server, # which this setting can help protect against by only allowing a certain amount of concurrent requests.
concurrent_render_request_limit = 30 @@ -699,8 +1114,36 @@ disable_sanitize_html = false [plugins] enable_alpha = false app_tls_skip_verify_insecure = false -# Enter a comma-separated list of plugin identifiers to identify plugins that are allowed to be loaded even if they lack a valid signature. +# Enter a comma-separated list of plugin identifiers to identify plugins to load even if they are unsigned. Plugins with modified signatures are never loaded. allow_loading_unsigned_plugins = +# Enable or disable installing / uninstalling / updating plugins directly from within Grafana. +plugin_admin_enabled = true +plugin_admin_external_manage_enabled = false +plugin_catalog_url = https://grafana.com/grafana/plugins/ +# Enter a comma-separated list of plugin identifiers to hide in the plugin catalog. +plugin_catalog_hidden_plugins = + +#################################### Grafana Live ########################################## +[live] +# max_connections to Grafana Live WebSocket endpoint per Grafana server instance. See Grafana Live docs +# if you are planning to make it higher than default 100 since this can require some OS and infrastructure +# tuning. 0 disables Live, -1 means unlimited connections. +max_connections = 100 + +# allowed_origins is a comma-separated list of origins that can establish connection with Grafana Live. +# If not set then origin will be matched over root_url. Supports wildcard symbol "*". +allowed_origins = + +# engine defines an HA (high availability) engine to use for Grafana Live. By default no engine used - in +# this case Live features work only on a single Grafana server. +# Available options: "redis". +# Setting ha_engine is an EXPERIMENTAL feature. +ha_engine = + +# ha_engine_address sets a connection address for Live HA engine. Depending on engine type address format can differ. +# For now we only support Redis connection address in "host:port" format. +# This option is EXPERIMENTAL. 
+ha_engine_address = "127.0.0.1:6379" #################################### Grafana Image Renderer Plugin ########################## [plugin.grafana-image-renderer] @@ -745,14 +1188,16 @@ rendering_chrome_bin = # Mode 'reusable' will have one browser instance and will create a new incognito page on each request. rendering_mode = -# When rendering_mode = clustered you can instruct how many browsers or incognito pages can execute concurrently. Default is 'browser' +# When rendering_mode = clustered, you can instruct how many browsers or incognito pages can execute concurrently. Default is 'browser' # and will cluster using browser instances. # Mode 'context' will cluster using incognito pages. rendering_clustering_mode = -# When rendering_mode = clustered you can define maximum number of browser instances/incognito pages that can execute concurrently.. +# When rendering_mode = clustered, you can define the maximum number of browser instances/incognito pages that can execute concurrently. Default is '5'. rendering_clustering_max_concurrency = +# When rendering_mode = clustered, you can specify the duration a rendering request can take before it will time out. Default is `30` seconds. +rendering_clustering_timeout = -# Limit the maxiumum viewport width, height and device scale factor that can be requested. +# Limit the maximum viewport width, height and device scale factor that can be requested. rendering_viewport_max_width = rendering_viewport_max_height = rendering_viewport_max_device_scale_factor = @@ -766,5 +1211,85 @@ grpc_port = license_path = [feature_toggles] -# enable features, separated by spaces +# there are currently two ways to enable feature toggles in the `grafana.ini`. +# you can either pass an array of feature you want to enable to the `enable` field or +# configure each toggle by setting the name of the toggle to true/false. Toggles set to true/false +# will take precedence over toggles in the `enable` list. 
+ +# enable = feature1,feature2 enable = + +# The new prometheus visual query builder +promQueryBuilder = true + +# The new loki visual query builder +lokiQueryBuilder = true + +# Experimental Explore to Dashboard workflow +explore2Dashboard = true + +# Command Palette +commandPalette = true + +# Use dynamic labels in CloudWatch datasource +cloudWatchDynamicLabels = true + +# feature1 = true +# feature2 = false + +[date_formats] +# For information on which formatting patterns are supported, see https://momentjs.com/docs/#/displaying/ + +# Default system date format used in time range picker and other places where full time is displayed +full_date = YYYY-MM-DD HH:mm:ss + +# Used by graph and other places where we only show small intervals +interval_second = HH:mm:ss +interval_minute = HH:mm +interval_hour = MM/DD HH:mm +interval_day = MM/DD +interval_month = YYYY-MM +interval_year = YYYY + +# Experimental feature +use_browser_locale = false + +# Default timezone for user preferences. Options are 'browser' for the browser local timezone or a timezone name from IANA Time Zone database, e.g. 'UTC' or 'Europe/Amsterdam' etc. +default_timezone = browser + +[expressions] +# Enable or disable the expressions functionality. +enabled = true + +[geomap] +# Set the JSON configuration for the default basemap +default_baselayer_config = + +# Enable or disable loading other base map layers +enable_custom_baselayers = true + +#################################### Dashboard previews ##################################### + +[dashboard_previews.crawler] +# Number of dashboards rendered in parallel. Default is 6. +thread_count = + +# Timeout passed down to the Image Renderer plugin. It is used in two separate places within a single rendering request: +# First during the initial navigation to the dashboard and then when waiting for all the panels to load. Default is 20s. +# This setting should be expressed as a duration. Examples: 10s (seconds), 1m (minutes).
+rendering_timeout = + +# Maximum duration of a single crawl. Default is 1h. +# This setting should be expressed as a duration. Examples: 10s (seconds), 1m (minutes). +max_crawl_duration = + +# Minimum interval between two subsequent scheduler runs. Default is 12h. +# This setting should be expressed as a duration. Examples: 10s (seconds), 1m (minutes). +scheduler_interval = + + +#################################### Storage ################################################ + +[storage] +# Allow uploading SVG files without sanitization. +allow_unsanitized_svg_upload = false \ No newline at end of file diff --git a/templates/prometheus/config/grafana/provisioning/dashboards/Alertmanager.json b/templates/prometheus/config/grafana/provisioning/dashboards/Alertmanager.json new file mode 100644 index 0000000..cf65d71 --- /dev/null +++ b/templates/prometheus/config/grafana/provisioning/dashboards/Alertmanager.json @@ -0,0 +1,3578 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "uid": "$datasource" + }, + "enable": true, + "expr": "changes(process_start_time_seconds{ instance=~\"$instance\"}[2m]) > 0", + "hide": false, + "iconColor": "#bf1b00", + "name": "Restarts", + "showIn": 0, + "step": "1m", + "tagKeys": "instance", + "titleFormat": "Restart" + } + ] + }, + "description": "Dashboard showing Prometheus Alertmanager metrics for observing status of the cluster and possible debbuging.", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 9578, + "graphTooltip": 1, + "id": 33, + "links": [ + { + "icon": "doc", + "tags": [], + "targetBlank": true, + "title": "Docs", + "tooltip": "Official documentation of Alertmanager", + "type": "link", + "url": 
"https://prometheus.io/docs/alerting/alertmanager/" + }, + { + "icon": "info", + "tags": [], + "targetBlank": true, + "title": "GitHub", + "tooltip": "Alertmanager sources on GitHub", + "type": "link", + "url": "https://github.com/prometheus/alertmanager" + }, + { + "icon": "info", + "tags": [], + "targetBlank": true, + "title": "Twitter", + "tooltip": "Twitter account with prometheus related info", + "type": "link", + "url": "https://twitter.com/PrometheusIO" + }, + { + "icon": "question", + "tags": [], + "targetBlank": true, + "title": "Mailing list", + "tooltip": "Prometheus users mailing list", + "type": "link", + "url": "https://groups.google.com/forum/#!forum/prometheus-users" + }, + { + "icon": "question", + "tags": [], + "targetBlank": true, + "title": "IRC", + "tooltip": "Join IRC using Riot", + "type": "link", + "url": "https://riot.im/app/#/room/#prometheus:matrix.org" + } + ], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 36, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "refId": "A" + } + ], + "title": "General info", + "type": "row" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "#f4d598", + "mode": "fixed" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 4, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", 
+ "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "count(alertmanager_build_info{instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Number of instances", + "type": "stat" + }, + { + "columns": [], + "datasource": { + "uid": "$datasource" + }, + "description": "Table containing list of Alertmanager instances showing it's version, up time, last reload time and if it was successful.", + "fontSize": "90%", + "gridPos": { + "h": 5, + "w": 9, + "x": 3, + "y": 1 + }, + "id": 26, + "links": [], + "scroll": true, + "showHeader": true, + "sort": { + "col": 13, + "desc": false + }, + "styles": [ + { + "alias": "Time", + "align": "auto", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Instance", + "align": "auto", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "instance", + "thresholds": [], + "type": "string", + "unit": "short" + }, + { + "alias": "Version", + "align": "auto", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "version", + "thresholds": [], + "type": "string", + "unit": "short" + }, + { + "alias": "Up time", + "align": "auto", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "mappingType": 1, + "pattern": "Value #A", + "thresholds": [], + "type": "number", + "unit": "s" + }, + { + "alias": "Last reload", + "align": "auto", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 
0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "mappingType": 1, + "pattern": "Value #B", + "thresholds": [], + "type": "number", + "unit": "s" + }, + { + "alias": "Last reload sucessfull", + "align": "auto", + "colorMode": "cell", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #C", + "thresholds": [ + "0", + "1" + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "align": "auto", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [], + "type": "hidden", + "unit": "short" + } + ], + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "time() - (alertmanager_build_info{instance=~\"$instance\"} * on (instance, cluster) group_left process_start_time_seconds{instance=~\"$instance\"})", + "format": "table", + "instant": true, + "intervalFactor": 1, + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "time() - alertmanager_config_last_reload_success_timestamp_seconds{instance=~\"$instance\"}", + "format": "table", + "instant": true, + "intervalFactor": 1, + "refId": "B" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_config_last_reload_successful{instance=~\"$instance\"}", + "format": "table", + "instant": true, + "intervalFactor": 1, + "refId": "C" + } + ], + "title": "Instance versions and up time", + "transform": "table", + "type": "table-old" + }, + { + "datasource": { + "uid": "$datasource" + }, + "description": "Number of peers in the Alertmanager cluster.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "#e5ac0e", + "mode": "fixed" + }, + "decimals": 0, + "mappings": [ + { + "options": 
{ + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 207, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "max(alertmanager_cluster_members{instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Cluster size", + "type": "stat" + }, + { + "datasource": { + "uid": "$datasource" + }, + "description": "Current number of active alerts.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "#bf1b00", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 2, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "max(alertmanager_alerts{state=\"active\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + 
"refId": "A" + } + ], + "title": "Number of active alerts", + "type": "stat" + }, + { + "datasource": { + "uid": "$datasource" + }, + "description": "Current number of suppressed alerts.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "#f9e2d2", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 18, + "y": 1 + }, + "id": 3, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "max(alertmanager_alerts{state=\"suppressed\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Number of suppressed alerts", + "type": "stat" + }, + { + "datasource": { + "uid": "$datasource" + }, + "description": "Current number of active silences.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "#f9e2d2", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 121, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": 
"horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "max(alertmanager_silences{state=\"active\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Number of active silences", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 113, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "refId": "A" + } + ], + "title": "Notifications", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "Number of sent notifications to distinct integrations such as PagerDuty, Slack and so on. 
On negative axis are displayed failed notifications.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 118, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "total", + "sortDesc": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.1.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "/Failed.*/", + "color": "#99440a", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(increase(alertmanager_notifications_total{instance=~\"$instance\"}[$__interval])) by (integration)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ integration}}", + "refId": "B" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(increase(alertmanager_notifications_failed_total{instance=~\"$instance\"}[$__interval])) by (integration)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Failed {{ integration }}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Notifications sent from $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "Duration of notification sends in 0.99 and 0.9 quantiles per integration.", + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "hiddenSeries": false, + "id": 115, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.1.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "/0.99.*/", + "linewidth": 1 + }, + { + "alias": "/0.5 .*/", + "linewidth": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(histogram_quantile(0.9,rate(alertmanager_notification_latency_seconds_bucket{instance=~\"$instance\"}[$__interval]))) by (integration)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "0.9q {{ integration }}", + "refId": "B" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(histogram_quantile(0.99,rate(alertmanager_notification_latency_seconds_bucket{instance=~\"$instance\"}[$__interval]))) by (integration)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "0.99q {{ integration }}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Notification durations per integration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + 
"yaxes": [ + { + "format": "s", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 18, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "refId": "A" + } + ], + "title": "Alerts", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "Number of alerts by state such as `active`, `suppressed` etc.", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 19 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.1.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "active", + "color": "#bf1b00" + }, + { + "alias": "suppressed", + "color": "#2f575e" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(alertmanager_alerts{instance=~\"$instance\"}) by (state)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Active alerts in $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { 
+ "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "Number of received alerts from Prometheus by status `firing` on positive axis and `resolved` on negative axis.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.1.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "resolved", + "color": "#7eb26d", + "transform": "negative-Y" + }, + { + "alias": "firing", + "color": "#99440a" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(increase(alertmanager_alerts_received_total{instance=~\"$instance\"}[$__interval])) by (status)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ status }}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Received alerts by status for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": true, + "datasource": { + "type": 
"prometheus", + "uid": "PBFA97CFB590B2093" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 34, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "Shows cluster score representing cluster health. From Hashicorps official documentation: \n> This metric describes a node's perception of its own health based on how well it is meeting the soft real-time requirements of the protocol. This metric ranges from 0 to 8, where 0 indicates \"totally healthy\".\n\nFor more info see https://www.consul.io/docs/agent/telemetry.html#cluster-health", + "fill": 1, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 20 + }, + "id": 57, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Number of failed peers", + "color": "#bf1b00", + "fill": 7 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_cluster_health_score{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster health score", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Clusterhealth score for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": 
false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "Shows gossip cluster members count in time and failing peers in case of any in red color.", + "fill": 1, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 24 + }, + "id": 38, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Number of failed peers", + "color": "#bf1b00", + "fill": 7 + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_cluster_members{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of cluster members", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_cluster_failed_peers{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of failed peers", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Cluster members count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "On positive axis shows number of peers that joined the cluster and on negative axis number of peers that left the cluster.", 
+ "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 28 + }, + "id": 75, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Cluster left peers", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_cluster_peers_left_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster joined peers", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_cluster_peers_joined_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster left peers", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Cluster peers left/joined on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "On positive axis is number of attempts to reconnect the cluster. 
On negative axis is number of failed attempts.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 34 + }, + "id": 68, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Failed reconnections", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_cluster_reconnections_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Successful reconnections", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_cluster_reconnections_failed_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Failed reconnections", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Cluster reconnections on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "On positive axis is number of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", + "fill": 1, + "gridPos": { + "h": 6, + "w": 
6, + "x": 0, + "y": 40 + }, + "id": 48, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "/received.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*full_state.*/", + "color": "#629e51" + }, + { + "alias": "/.*update.*/", + "color": "#f4d598" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(increase(alertmanager_cluster_messages_sent_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "sent {{msg_type}}", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(increase(alertmanager_cluster_messages_received_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "received {{msg_type}}", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Cluster messages count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "On positive axis is size of sent cluster messages by type `update` or `full_state` and on negative axis the same for received messages.", + "fill": 1, + 
"gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 46 + }, + "id": 53, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "/received.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*full_state.*/", + "color": "#629e51" + }, + { + "alias": "/.*update.*/", + "color": "#f4d598" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(increase(alertmanager_cluster_messages_sent_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "sent {{msg_type}}", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(increase(alertmanager_cluster_messages_received_size_total{instance=~\"$instance\"}[$__interval])) by (msg_type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "received {{msg_type}}", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Cluster messages size on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "On positive axis is number of queued cluster messages and on negative axis number of pruned messages.", + "fill": 1, + 
"gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 52 + }, + "id": 62, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Pruned messaged", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_cluster_messages_pruned_total{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pruned messaged", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_cluster_messages_queued{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Queued messages", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Cluster messages queue on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "refId": "A" + } + ], + "title": "Cluster members", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 284, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, 
+ "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 314, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "/dropped/", + "color": "#cca300", + "transform": "negative-Y" + }, + { + "alias": "/failed/", + "color": "#bf1b00", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_oversized_gossip_message_sent_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{key}}", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_oversized_gossip_message_dropped_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "dropped {{key}}", + "refId": "B" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_oversized_gossip_message_failure_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "failed {{key}}", + "refId": "C" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Count of oversized gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": 
{}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 307, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "histogram_quantile(1,rate(alertmanager_oversize_gossip_message_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{key}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Duration of oversized gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 22 + }, + "id": 303, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": 
false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_silences_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "silences", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_nflog_gossip_messages_propagated_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "nf_log", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Number of propagated gossip messages on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "refId": "A" + } + ], + "title": "Gossip messages", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 84, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 11 + }, + "id": 94, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": 
true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_nflog_queries_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query count", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_nflog_query_errors_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query errors", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Nf log queries count for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 17 + }, + "id": 106, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Nf log query errors", + 
"color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "histogram_quantile(1,rate(alertmanager_nflog_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log query duration", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Nf log query duration for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 23 + }, + "id": 97, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_nflog_snapshot_size_bytes{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nf log snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Nf log snapshot size for $instance", + "tooltip": { + "shared": true, + 
"sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 29 + }, + "id": 101, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval]) / rate(alertmanager_nflog_snapshot_duration_seconds_sum{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Nf log snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Nf log snapshot duration for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + 
"uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 35 + }, + "id": 92, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Cluster left peers", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_nflog_gc_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Cluster joined peers", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Nf log Go GC time for $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "refId": "A" + } + ], + "title": "Nflog", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 123, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 12 + }, + "id": 129, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": 
false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_silences{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Silences count by state on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 18 + }, + "id": 134, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Silecnces query fails", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": 
"increase(alertmanager_silences_queries_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces query count", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "increase(alertmanager_silences_query_errors_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Silecnces query fails", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Silences query count on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 24 + }, + "id": 138, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "histogram_quantile(1,rate(alertmanager_silences_query_duration_seconds_bucket{instance=~\"$instance\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces query duration", + "refId": "A" + 
} + ], + "thresholds": [], + "timeRegions": [], + "title": "Silences query duration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 30 + }, + "id": 149, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_silences_snapshot_size_bytes{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silences snapshot size", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Silences snapshot size on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 36 + }, + "id": 143, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_silences_snapshot_duration_seconds{instance=~\"$instance\", quantile=\"0.99\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces snapshot duration", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Silences snapshot duration on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 42 + }, + "id": 131, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + 
"seriesOverrides": [ + { + "alias": "Nf log query errors", + "color": "#890f02", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "alertmanager_silences_gc_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Silecnces GC duration", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Silences GC duraton on $instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "refId": "A" + } + ], + "title": "Silences", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 173, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "decimals": 2, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 13 + }, + "id": 175, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "/Limit .*/", + "color": "#C15C17", + "dashes": true, + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": 
false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "rate(process_cpu_seconds_total{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "max(kube_pod_container_resource_limits_cpu_cores{pod=~\"$instance\"}) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit {{pod}}", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPU usage/s for $instance", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 20 + }, + "id": 177, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "instance", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "/Limit .*/", + "color": "#C15C17", + "dashes": true, + "fill": 0 + } + ], + 
"spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "process_resident_memory_bytes{instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ instance }}", + "refId": "E" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "max(kube_pod_container_resource_limits_memory_bytes{pod=~\"$instance\"}) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit {{ pod }}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Memory usage for $instance", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "refId": "A" + } + ], + "title": "Resources", + "type": "row" + } + ], + "refresh": "5m", + "schemaVersion": 37, + "style": "dark", + "tags": [ + "alertmanager", + "prometheus", + "alerting" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "Prometheus datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": "", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "definition": "query_result(alertmanager_build_info)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + 
"options": [], + "query": { + "query": "query_result(alertmanager_build_info)", + "refId": "Prometheus-instance-Variable-Query" + }, + "refresh": 2, + "regex": "/.*instance=\"([^\"]+)\".*/", + "skipUrlSync": false, + "sort": 1, + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Alertmanager", + "uid": "eea-9_sik", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/templates/prometheus/config/prometheus/prometheus.yml.j2 b/templates/prometheus/config/prometheus/prometheus.yml.j2 index cd0ee0b..183f570 100644 --- a/templates/prometheus/config/prometheus/prometheus.yml.j2 +++ b/templates/prometheus/config/prometheus/prometheus.yml.j2 @@ -25,9 +25,9 @@ alerting: # Here it's Prometheus itself. 
scrape_configs: -############################################## -### Applications #### -############################################## +############################################### +### Monitoring #### +############################################### - job_name: 'prometheus' static_configs: @@ -36,13 +36,48 @@ scrape_configs: ] labels: env: {{ stage }} - project: smardigo + project: monitoring + application: prometheus relabel_configs: - source_labels: [__address__] target_label: instance - replacement: 'prometheus.{{ domain }}' + replacement: '{{ inventory_hostname }}-prometheus.{{ domain }}' -############################################## + - job_name: 'alertmanager' + static_configs: + - targets: [ + '{{ inventory_hostname }}-alertmanager:9093' + ] + labels: + env: {{ stage }} + project: monitoring + application: alertmanager + relabel_configs: + - source_labels: [__address__] + target_label: instance + replacement: '{{ inventory_hostname }}-alertmanager.{{ domain }}' + + - job_name: 'blackbox' + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: {{ blackbox_http_2xx_targets + blackbox_http_2xx_additional_targets }} + labels: + env: {{ stage }} + project: monitoring + application: blackbox + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: "{{ blackbox_exporter_fqdn }}:9115" + +############################################ +### Traefik #### +############################################ - job_name: 'traefik' scheme: {{ http_s }} @@ -61,7 +96,7 @@ scrape_configs: ] labels: env: {{ stage }} - project: smardigo + project: traefik application: traefik relabel_configs: - source_labels: [__address__] @@ -69,7 +104,9 @@ scrape_configs: target_label: instance replacement: $1 -############################################## +########################################### +### Harbor #### 
+########################################### - job_name: 'harbor-exporter' scheme: {{ http_s }} @@ -141,7 +178,9 @@ scrape_configs: target_label: instance replacement: $1 -############################################## +############################################# +### Smardigo #### +############################################# - job_name: 'connect' scheme: {{ http_s }} @@ -206,7 +245,9 @@ scrape_configs: target_label: instance replacement: $1 -############################################## +############################################# +### Keycloak #### +############################################# - job_name: 'keycloak' scheme: {{ http_s }} @@ -221,7 +262,7 @@ scrape_configs: ] labels: env: {{ stage }} - project: smardigo + project: keycloak application: keycloak relabel_configs: - source_labels: [__address__] @@ -230,6 +271,35 @@ scrape_configs: replacement: $1 ############################################## +### AWX #### +############################################## + + - job_name: 'awx' + scheme: {{ http_s }} + metrics_path: '/api/v2/metrics' + scrape_interval: 5s + basic_auth: + username: '{{ awx_ansible_username }}' + password: '{{ awx_ansible_password }}' + static_configs: + - targets: ['{{ shared_service_kube_awx_hostname }}'] + labels: + env: {{ stage }} + project: awx + application: awx + relabel_configs: + - source_labels: [__address__] + regex: (.*):.* + target_label: instance + replacement: $1 + - source_labels: [__address__] + regex: (.*):.* + target_label: __address__ + replacement: $1 + +################################################### +### Infrastructure #### +################################################### - job_name: 'gitea' scheme: {{ http_s }} @@ -244,16 +314,14 @@ scrape_configs: ] labels: env: {{ stage }} - project: smardigo + project: infrastructure application: gitea relabel_configs: - source_labels: [__address__] - regex: (.*) + regex: (.*):.* target_label: instance replacement: $1 
-############################################## - - job_name: 'redis' scheme: http metrics_path: '/metrics' @@ -267,7 +335,7 @@ scrape_configs: ] labels: env: {{ stage }} - project: smardigo + project: infrastructure application: redis relabel_configs: - source_labels: [__address__] @@ -296,11 +364,8 @@ scrape_configs: ] labels: env: {{ stage }} - project: smardigo + project: servers relabel_configs: - - source_labels: [job] - target_label: job - replacement: 'node-exporter' - source_labels: [__address__] regex: .*!(.*) target_label: instance @@ -318,7 +383,7 @@ scrape_configs: - '{{ blackbox_exporter_fqdn }}:9100' labels: env: {{ stage }} - project: smardigo + project: servers relabel_configs: - source_labels: [job] target_label: job @@ -332,9 +397,9 @@ scrape_configs: target_label: __address__ replacement: $1 -############################################## -### Databases #### -############################################## +############################################# +### Database #### +############################################# - job_name: 'elasticsearch-exporter' scheme: http @@ -349,11 +414,9 @@ scrape_configs: ] labels: env: {{ stage }} - project: smardigo + project: database + application: elasticsearch relabel_configs: - - source_labels: [job] - target_label: job - replacement: 'elasticsearch-exporter' - source_labels: [__address__] regex: (.*):.* target_label: instance @@ -372,11 +435,9 @@ scrape_configs: ] labels: env: {{ stage }} - project: smardigo + project: database + application: postgres relabel_configs: - - source_labels: [job] - target_label: job - replacement: 'postgres-exporter' - source_labels: [__address__] regex: .*!(.*) target_label: instance @@ -399,11 +460,9 @@ scrape_configs: ] labels: env: {{ stage }} - project: smardigo + project: database + application: maria relabel_configs: - - source_labels: [job] - target_label: job - replacement: 'maria-exporter' - source_labels: [__address__] regex: .*!(.*) target_label: instance @@ -413,30 
+472,9 @@ scrape_configs: target_label: __address__ replacement: $1 -############################################## -### blackbox #### -############################################## - - - job_name: 'blackbox_smardigo' - metrics_path: /probe - params: - module: [http_2xx] - static_configs: - - targets: {{ blackbox_http_2xx_targets + blackbox_http_2xx_additional_targets }} - labels: - env: {{ stage }} - project: smardigo - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: "{{ blackbox_exporter_fqdn }}:9115" - {% if prometheus_federation_enabled %} ############################################## -### federation #### +### Federation #### ############################################## - job_name: 'federate - kube' @@ -451,19 +489,5 @@ scrape_configs: static_configs: - targets: ['{{ kubernetes_prometheus_endpoint }}'] -{% endif %} - -############################################## -### awx #### -############################################## - - - job_name: 'awx' - metrics_path: '/api/v2/metrics' - scrape_interval: 5s - scheme: https - basic_auth: - username: '{{ awx_ansible_username }}' - password: '{{ awx_ansible_password }}' - static_configs: - - targets: ['{{ shared_service_kube_awx_hostname }}'] +{% endif %}