feat: setup prometheus stack
parent
c10d556038
commit
57ff124d00
@ -1,30 +1,43 @@
|
||||
$ANSIBLE_VAULT;1.1;AES256
|
||||
37303936393666363130613561666665623364303361613938633232363532313830316231353935
|
||||
3335653838653863326334623161336336653435373664310a363561353430356166373766393836
|
||||
31303463343336613665326636643837633362636335343830396438363634656639376463353965
|
||||
6131343431653439340a363930643966386237323763613566393235303437376132393865323230
|
||||
63616136386133326131656565306233343831343030313935663764353330653231666533333238
|
||||
37343938363431333936613065613633396231306365346435623362653437326536663135333532
|
||||
38303838663865623737663866633839333835363864616330626335323338626331316263653233
|
||||
39343965666461653538303632636564336338313162663833623365396465336232366236383034
|
||||
61393839616164386565373930623338323130626330316565383338623634663338366233613963
|
||||
32306431383136346263633364626265363737353363396131313461656139393239306537343435
|
||||
34643830373965303339373831393465366565663936663061663434633131303033333436346566
|
||||
31376330613939356534613534313335653464616436393137396165326262636538656137326532
|
||||
63396138383364646339353539363230306461373732333037663862306161333966313462356363
|
||||
36656639346238313839623232373738376530633361373565353063323065626634306532626539
|
||||
37393038633761396539353233666563316535323965363233656134393365356339626565656134
|
||||
63323864653566646531393830396337353139653831343866303039313631613334313431643161
|
||||
39383264646566636538626433333937333230383564316437353464613862316532343564316530
|
||||
64623935383037326563613533313361333435326166343339616461386437356238376263356334
|
||||
33373166613033626130333962366464663262393134623838633937653837653332393061626637
|
||||
66653730396436313339616562626230363231303136333235663534626266613831646631633530
|
||||
39356263346231373463373761626430376431633135353939656664613632633965323838633362
|
||||
65633335643866633530346236653435343565663936376266663862363130303032323436646133
|
||||
66643833653363323935353636343430346561346262383436663838636536386638356438663430
|
||||
65363262396339323530303663333730313836346565623430633232366138376261393831643137
|
||||
37373734333538313566306631373233353364656438323435373265306531396534656265633532
|
||||
31643831353931393139663861346234333233353566333435373338393166376333343235303034
|
||||
37376565643162303531396566313531643933376663343663636230376338666565323263666539
|
||||
65373832373931393265333432313232633536646331633833613561366532363239326538333565
|
||||
3134
|
||||
64643166313265363734313932666666643238333366393865343132313835666433326366653337
|
||||
6135633264613662366233323835663034373761663864350a663161316266653238323332656336
|
||||
35323166323062323465623933653538356334666230616339313533613431613234653136386230
|
||||
3764333134323538310a353530616532613365326131376664386335336161326638663335326530
|
||||
38383166343939316537396332313664313064613561393036353164626566343136623835623237
|
||||
62383834376661643537356335646462323962633432336238333033343666326230326639363364
|
||||
37666261343733373839613362346166666231663463616436363838626134663861616566663137
|
||||
64613666616336303134393161323262386264666232336132376534316461333764363037376531
|
||||
63363433363233653839616132373038623436333866326338343130343734323662323137643033
|
||||
66313664636565323333646238363339653235346362393435323032656536373838643765313562
|
||||
61343437353165666464316135366266623263383033346534666538383566303162393430393761
|
||||
39663732636164346633396230336538376236663330323363626132383964323530336338363836
|
||||
37333837343836616231643730626134303031376130393431646464646438336334343565326463
|
||||
65623237373166303662336362663636333964643866643638666132303862626264353064616163
|
||||
39333130396237343431336132383238343535363834356462393430363162643635356363383238
|
||||
64626434616365346238626236366232333333366431336436363863316563313462366538366436
|
||||
33373264623837346232303131653464376534646438643332626566613735653439646661383536
|
||||
38346263346336346137616337363435666466343836656638653638646133303733363365323934
|
||||
64393631303163633061393530623535313961313737643638626665303363366439306366373064
|
||||
65333563636631373931313837653738356234393036323165663036653565386663313938373430
|
||||
34656233326230356262306464323563393066646262613933653032333864326261626330643333
|
||||
33636464396263323563626335306432373764353265373833653230333837653363333761666136
|
||||
34653035393262623134353361323230323238653034316466663663346462353337613939313238
|
||||
61393930373037313266663563386632343262373061333838646531373666383535323065646639
|
||||
66326539393061373465613130643761346330623866633263336532663966366339323665323363
|
||||
39386539633163653233356439303635646666303662393235316238393934633066373866623230
|
||||
34316366656130393738353637626166383563343233383163383639373539626234363265356532
|
||||
65303739393637656433656164373934613237336436326630393535633637323865386531646638
|
||||
63316365623639373332323366373461393766633662396562306534306466653162633131623131
|
||||
65306334656535383137343830323966346337323363343663326438613562643466643666386537
|
||||
64326334356561653231346433396439666237626336666239336463333536376130373866343736
|
||||
32396233333161313230656461396361626435666664616462363036386636396364636364323966
|
||||
31656130323264363862656461616562613934636636373535343333666565626134376266613937
|
||||
66393266613635313030356263366235323139663439303861356665333163386334646339613933
|
||||
62636338343237376630376364323763383562383462613366393738663237643931636161383631
|
||||
30623834383839613531616435613833636662313664323166363935396231643430376330396431
|
||||
36366235393933613362303466343433643731363835343862346131343836376132316536633034
|
||||
32333263313031313464343562633835323663363965373465633433386566313832346639623232
|
||||
63393832643937396130613638623231663137303832616266326461636164393565336537656437
|
||||
62393866636233343766633863643532396138636638326531326430613634353564386633343265
|
||||
33613930356433356139623830326165323632633039333837623136376661303736356661343364
|
||||
3736353162636662646162333934306562626662633931386565
|
||||
|
||||
@ -0,0 +1,5 @@
|
||||
---
|
||||
|
||||
# Name of the deployment stage these variables apply to.
stage: 'dev'

# Channel that alertmanager notifications are posted to.
# Must stay quoted: an unquoted leading '#' would start a YAML comment.
alertmanager_channel_smardigo: '#monitoring-qa'
|
||||
@ -0,0 +1,133 @@
|
||||
---
|
||||
|
||||
# Image and tag of the connect application container.
connect_image_name: "docker.arxes-tolina.de/smardigo/connect-whitelabel-app"
# NOTE(review): 'latest' is a floating tag - confirm unpinned deployments are intended.
connect_version: "latest"

# Application admin account, handed to the container as ADMIN_LOGIN / ADMIN_PASSWORD.
# NOTE(review): these look like placeholder defaults - presumably overridden per stage; verify.
connect_admin_username: 'connect-admin'
connect_admin_password: 'connect-admin'

# PostgreSQL side: image tag, database name and admin credentials.
connect_postgres_version: '12'
connect_postgres_database: 'connect-postgres'
connect_postgres_admin_username: 'connect-postgres-admin'
connect_postgres_admin_password: 'connect-postgres-admin'

# Base URLs passed to the application as MAIL_PROPERTIES_BASE_URL(_EXTERN).
connect_mail_properties_base_url: '{{ http_s }}://{{ service_url }}'
connect_mail_properties_base_url_extern: '{{ http_s }}://{{ service_url }}'

# Compose service names for the application and its database.
connect_id: '{{ service_name }}-connect'
connect_postgres_id: '{{ service_name }}-postgres-connect'
|
||||
|
||||
# Traefik labels for the connect container. Three routers share the same host
# rule and differ by entrypoint:
#   <connect_id>          -> service_port on `websecure`
#   <connect_id>-admin    -> management_port on `admin-service`, behind a CORS middleware
#   <connect_id>-monitor  -> the `node-exporter` service on `admin-system`
# Each entry carries its own inner double quotes, so the outer single quotes
# must be kept.
connect_labels:
  # application router
  - '"traefik.enable=true"'
  - '"traefik.http.routers.{{ connect_id }}.service={{ connect_id }}"'
  - '"traefik.http.routers.{{ connect_id }}.rule=Host(`{{ service_url }}`)"'
  - '"traefik.http.routers.{{ connect_id }}.entrypoints=websecure"'
  - '"traefik.http.routers.{{ connect_id }}.tls=true"'
  - '"traefik.http.routers.{{ connect_id }}.tls.certresolver=letsencrypt"'
  - '"traefik.http.services.{{ connect_id }}.loadbalancer.server.port={{ service_port }}"'

  # management router, including its CORS middleware
  - '"traefik.http.routers.{{ connect_id }}-admin.service={{ connect_id }}-admin"'
  - '"traefik.http.routers.{{ connect_id }}-admin.rule=Host(`{{ service_url }}`)"'
  - '"traefik.http.routers.{{ connect_id }}-admin.entrypoints=admin-service"'
  - '"traefik.http.routers.{{ connect_id }}-admin.tls=true"'
  - '"traefik.http.routers.{{ connect_id }}-admin.tls.certresolver=letsencrypt"'
  - '"traefik.http.routers.{{ connect_id }}-admin.middlewares={{ connect_id }}-admin-cors"'
  - '"traefik.http.middlewares.{{ connect_id }}-admin-cors.headers.accesscontrolallowmethods=GET,OPTIONS"'
  - '"traefik.http.middlewares.{{ connect_id }}-admin-cors.headers.accesscontrolalloworigin=*"'
  - '"traefik.http.middlewares.{{ connect_id }}-admin-cors.headers.accesscontrolallowheaders=SMA_USER"'
  - '"traefik.http.services.{{ connect_id }}-admin.loadbalancer.server.port={{ management_port }}"'

  # host monitoring router (targets the node-exporter service)
  - '"traefik.http.routers.{{ connect_id }}-monitor.service=node-exporter"'
  - '"traefik.http.routers.{{ connect_id }}-monitor.rule=Host(`{{ service_url }}`)"'
  - '"traefik.http.routers.{{ connect_id }}-monitor.entrypoints=admin-system"'
  - '"traefik.http.routers.{{ connect_id }}-monitor.tls=true"'
  - '"traefik.http.routers.{{ connect_id }}-monitor.tls.certresolver=letsencrypt"'
|
||||
|
||||
# Compose description of the connect stack: two external networks, one named
# volume for the database and two services (application + PostgreSQL).
# The environment entries are pre-formatted `KEY: "value"` strings; their inner
# quoting is kept exactly as is.
connect_docker:
  networks:
    - name: back-tier
      external: true
    - name: front-tier
      external: true
  volumes:
    - name: "{{ connect_postgres_id }}-data"
  services:
    # --- connect application ---
    - name: "{{ connect_id }}"
      image_name: "{{ connect_image_name }}"
      image_version: "{{ connect_version }}"
      # Base labels plus optional per-host additions.
      labels: "{{ connect_labels + ( connect_labels_additional | default([])) }}"
      restart: "{{ connect_service_restart | default('always') }}"
      environment:
        # admin account
        - "ADMIN_LOGIN: \"{{ connect_admin_username }}\""
        - "ADMIN_PASSWORD: \"{{ connect_admin_password }}\""

        # database connection
        - "DATASOURCE_URL: \"jdbc:postgresql://{{ connect_postgres_id }}:{{ service_port_postgres }}/{{ connect_postgres_database }}\""
        - "DATASOURCE_USERNAME: \"{{ connect_postgres_admin_username }}\""
        - "DATASOURCE_PASSWORD: \"{{ connect_postgres_admin_password }}\""

        # mail
        - "MAIL_PROTOCOL: \"{{ connect_mail_protocol | default('smtp') }}\""
        - "MAIL_HOST: \"{{ connect_mail_host | default('smtp.tolina.local') }}\""
        - "MAIL_PORT: \"{{ connect_mail_port | default('25') }}\""
        - "MAIL_USER: \"{{ connect_mail_user | default('') }}\""
        - "MAIL_PASSWORD: \"{{ connect_mail_password | default('') }}\""
        - "MAIL_PROPERTIES_SIMULATION: \"{{ connect_mail_properties_simulation | default('true') }}\""
        - "MAIL_PROPERTIES_BASE_URL: \"{{ connect_mail_properties_base_url }}\""
        - "MAIL_PROPERTIES_BASE_URL_EXTERN: \"{{ connect_mail_properties_base_url_extern }}\""
        - "MAIL_PROPERTIES_SENDER: \"{{ connect_mail_properties_sender | default('noreply-connect@arxes-tolina.de') }}\""
        - "MAIL_PROPERTIES_SENDER_ALIAS: \"{{ connect_mail_properties_sender_alias | default('noreply-connect') }}\""

        # authentication / OIDC
        - "AUTH_MODULE: \"{{ connect_auth_module | default('preauth') }}\""
        - "OIDC_CLIENT_ID: \"{{ connect_oidc_client_id | default('oidc_config_not_found') }}\""
        - "OIDC_CLIENT_SECRET: \"{{ connect_oidc_client_secret | default('oidc_config_not_found') }}\""
        - "OIDC_REGISTRATION_ID: \"{{ connect_oidc_registration_id | default('oidc_config_not_found') }}\""
        - "OIDC_ISSUER_URI: \"{{ connect_oidc_issuer_uri | default('oidc_config_not_found') }}\""
        - "PASSWORD_CHANGE_URL: \"{{ connect_password_change_url | default('') }}\""
        - "USER_MANAGEMENT_URL: \"{{ connect_iam_user_management_url | default('') }}\""

        # IAM
        - "IAM_MODULE: \"{{ connect_iam_module | default('embedded') }}\""
        - "IAM_CLIENT_ENABLED: \"{{ smardigo_iam_client_enabled | default('false') }}\""
        - "EXTERNAL_IAM_SERVER_URL: \"{{ smardigo_iam_client_server_url | default('') }}\""

        # API / CSRF tokens
        - "SMA_API_TOKEN_SECRET: \"{{ connect_api_token_secret | default('') }}\""

        - "SMA_CSRF_TOKEN_NAME: \"{{ connect_csrf_token_name | default('') }}\""
        - "SMA_CSRF_TOKEN_VALUE: \"{{ connect_csrf_token_value | default('') }}\""

        # Spring profile selection
        - "SPRING_PROFILES_INCLUDE: \"{{ spring_profiles_include | default('swagger') }}\""
        - "RIBBON_DISPLAY_ON_ACTIVE_PROFILES: \"{{ ribbon_display_on_active_profiles | default('dev') }}\""
      networks:
        - '"back-tier"'
        - '"front-tier"'
      extra_hosts: "{{ connect_extra_hosts | default([]) }}"

    # --- PostgreSQL database (back-tier only) ---
    - name: "{{ connect_postgres_id }}"
      image_name: "postgres"
      image_version: "{{ connect_postgres_version }}"
      environment:
        - 'POSTGRES_DB: "{{ connect_postgres_database }}"'
        - 'POSTGRES_USER: "{{ connect_postgres_admin_username }}"'
        - 'POSTGRES_PASSWORD: "{{ connect_postgres_admin_password }}"'
      volumes:
        - '"{{ connect_postgres_id }}-data:/var/lib/postgresql/data"'
      networks:
        - '"back-tier"'
      # No ports are published unless connect_postgres_ports is set.
      ports: "{{ connect_postgres_ports | default([]) }}"
|
||||
@ -0,0 +1 @@
|
||||
---
|
||||
@ -0,0 +1 @@
|
||||
---
|
||||
@ -0,0 +1,188 @@
|
||||
---
|
||||
|
||||
### tags:
|
||||
### create_users
|
||||
### update_deployment
|
||||
|
||||
|
||||
# Post the rendered "deploy start" template to the mattermost webhook.
# Delegated to 127.0.0.1 without privilege escalation; only runs when
# send_status_messages is truthy.
- name: 'Send mattermost message'
  when: send_status_messages
  become: false
  delegate_to: 127.0.0.1
  uri:
    url: '{{ mattermost_hook_smardigo }}'
    method: POST
    body_format: json
    body: "{{ lookup('template','mattermost-deploy-start.json.j2') }}"
    headers:
      Content-Type: 'application/json'
|
||||
|
||||
# Hand a record (name = service name, data = stage server IP) to the `domain`
# tasks of the `_digitalocean` role.
- name: 'Setup DNS configuration for {{ service_name }}'
  vars:
    record_name: '{{ service_name }}'
    record_data: '{{ stage_server_ip }}'
  include_role:
    name: _digitalocean
    tasks_from: domain
|
||||
|
||||
# Same as the DNS task above, but once per entry of connect_public_dns_entries
# (each entry provides `name` and `ip`). Skipped when the list is not defined.
- name: 'Setup public DNS configuration for {{ service_name }}'
  when: connect_public_dns_entries is defined
  loop: '{{ connect_public_dns_entries }}'
  vars:
    record_name: '{{ item.name }}'
    record_data: '{{ item.ip }}'
  include_role:
    name: _digitalocean
    tasks_from: domain
|
||||
|
||||
# Run the `networks` tasks of the `_docker` role.
- name: 'Check docker networks'
  include_role:
    tasks_from: networks
    name: _docker
|
||||
|
||||
# Stat the service's docker-compose.yml on the target; the result is stored in
# check_docker_compose_file.
- name: 'Check if {{ service_name }}/docker-compose.yml exists'
  tags:
    - update_deployment
  stat:
    path: "{{ service_base_path }}/{{ service_name }}/docker-compose.yml"
  register: check_docker_compose_file
|
||||
|
||||
# Bring the compose stack down before it is redeployed.
# - Skipped when no docker-compose.yml was found by the preceding stat task.
# - Errors are ignored so that a failing `down` does not abort the play.
# `command` is used instead of `shell` because no shell features are needed.
- name: "Stop {{ service_name }}"
  command: docker-compose down
  args:
    chdir: '{{ service_base_path }}/{{ service_name }}'
  when: check_docker_compose_file.stat.exists
  ignore_errors: true
  tags:
    - update_deployment
|
||||
|
||||
# Run the `configs` tasks of the `_deploy` role for the "connect" config set,
# passing target path, ownership and the compose description (connect_docker).
- name: 'Deploy service configuration for {{ service_name }}'
  vars:
    current_config: 'connect'
    current_docker: '{{ connect_docker }}'
    current_base_path: '{{ service_base_path }}'
    current_destination: '{{ service_name }}'
    current_owner: '{{ docker_owner }}'
    current_group: '{{ docker_group }}'
  include_role:
    name: _deploy
    tasks_from: configs
|
||||
|
||||
# Pull the images referenced by the service's compose file.
# `command` is used instead of `shell` because no shell features are needed.
- name: "Update {{ service_name }}"
  command: docker-compose pull
  args:
    chdir: '{{ service_base_path }}/{{ service_name }}'
  tags:
    - update_deployment
|
||||
|
||||
# Start the compose stack in detached mode.
# `command` is used instead of `shell` because no shell features are needed.
- name: "Start {{ service_name }}"
  command: docker-compose up -d
  args:
    chdir: '{{ service_base_path }}/{{ service_name }}'
  tags:
    - update_deployment
|
||||
|
||||
# Run the `caddy_config` tasks of the `_deploy` role with three
# external -> internal mappings: application, management endpoint and
# node-exporter.
- name: 'Update caddy configuration for {{ service_name }}'
  vars:
    current_service: '{{ service_name }}'
    current_services:
      # application
      - external: '{{ service_url }}'
        internal: '{{ service_name }}-connect:{{ service_port }}'
      # management endpoint
      - external: '{{ service_url }}:{{ monitor_port_service }}'
        internal: '{{ service_name }}-connect:{{ management_port }}'
      # host metrics
      - external: '{{ service_url }}:{{ monitor_port_system }}'
        internal: 'node-exporter:{{ service_port_node_exporter }}'
  include_role:
    name: _deploy
    tasks_from: caddy_config
|
||||
|
||||
# For every public DNS entry, map `<entry name>.<domain>` onto the application
# port of the connect container. Skipped when connect_public_dns_entries is
# not defined.
- name: 'Update public caddy configuration for {{ service_name }}'
  when: connect_public_dns_entries is defined
  loop: '{{ connect_public_dns_entries }}'
  vars:
    current_service: '{{ service_name }}-{{ item.name }}'
    current_services:
      - external: '{{ item.name }}.{{ domain }}'
        internal: '{{ service_name }}-connect:{{ service_port }}'
  include_role:
    name: _deploy
    tasks_from: caddy_config
|
||||
|
||||
# Pass one landing-page entry for the service itself (name, URL, deployed
# version, timestamp and management link) to the `caddy_landing_page` tasks of
# the `_deploy` role.
- name: 'Update landing page entries for {{ service_name }}'
  tags:
    - update_deployment
  vars:
    current_services:
      - current_name: '{{ service_name }}'
        current_url: '{{ http_s }}://{{ service_url }}'
        current_version: '{{ connect_version }}'
        current_date: '{{ ansible_date_time.iso8601 }}'
        management: '{{ http_s }}://{{ service_url }}:{{ monitor_port_service }}/management'
  include_role:
    name: _deploy
    tasks_from: caddy_landing_page
|
||||
|
||||
# One landing-page entry per public DNS entry; the URL is built from the entry
# name and the domain, the management link still points at the service URL.
# Skipped when connect_public_dns_entries is not defined.
- name: 'Update landing page with public entries {{ service_name }}'
  tags:
    - update_deployment
  when: connect_public_dns_entries is defined
  loop: '{{ connect_public_dns_entries }}'
  vars:
    current_services:
      - current_name: '{{ item.name }}'
        current_url: '{{ http_s }}://{{ item.name }}.{{ domain }}'
        current_version: '{{ connect_version }}'
        current_date: '{{ ansible_date_time.iso8601 }}'
        management: '{{ http_s }}://{{ service_url }}:{{ monitor_port_service }}/management'
  include_role:
    name: _deploy
    tasks_from: caddy_landing_page
|
||||
|
||||
# One landing-page entry per extra domain entry; here the entry's `domain`
# value is used verbatim as the URL. Skipped when connect_extra_domain_entries
# is not defined.
- name: 'Update landing page with extra entries for {{ service_name }}'
  tags:
    - update_deployment
  when: connect_extra_domain_entries is defined
  loop: '{{ connect_extra_domain_entries }}'
  vars:
    current_services:
      - current_name: '{{ item.name }}'
        current_url: '{{ item.domain }}'
        current_version: '{{ connect_version }}'
        current_date: '{{ ansible_date_time.iso8601 }}'
        management: '{{ http_s }}://{{ service_url }}:{{ monitor_port_service }}/management'
  include_role:
    name: _deploy
    tasks_from: caddy_landing_page
|
||||
|
||||
# Post the rendered "deploy end" template to the mattermost webhook.
# Delegated to 127.0.0.1 without privilege escalation; only runs when
# send_status_messages is truthy.
# (Task name typo "messsge" corrected to match the start notification.)
- name: "Send mattermost message"
  uri:
    url: "{{ mattermost_hook_smardigo }}"
    method: POST
    body: "{{ lookup('template','mattermost-deploy-end.json.j2') }}"
    body_format: json
    headers:
      Content-Type: "application/json"
  delegate_to: 127.0.0.1
  become: false
  when:
    - send_status_messages
|
||||
@ -0,0 +1,5 @@
|
||||
{
|
||||
"id": "{{ current_user.userId }}",
|
||||
"firstName": "{{ current_user.firstName | default('null') }}",
|
||||
"lastName": "{{ current_user.lastName | default('null') }}"
|
||||
}
|
||||
@ -0,0 +1 @@
|
||||
---
|
||||
@ -0,0 +1,132 @@
|
||||
---
|
||||
|
||||
# Image tags of the monitoring components (quoted so they stay strings).
grafana_version: '7.0.5'
prometheus_version: 'v2.19.2'
alertmanager_version: 'v0.21.0'
blackbox_exporter_version: 'v0.17.0'
postgres_exporter_version: 'v0.8.0'

# Ports used for the individual components.
service_port_grafana: 3000
service_port_prometheus: 9090
service_port_alertmanager: 9093
service_port_blackbox_exporter: 9115
service_port_postgres_exporter: 9187

# Compose service names, derived from the service name.
prometheus_id: '{{ service_name }}-prometheus'
alertmanager_id: '{{ service_name }}-alertmanager'
grafana_id: '{{ service_name }}-grafana'
|
||||
|
||||
# Compose description of the monitoring stack: two external networks, one
# named data volume per component and three services (prometheus,
# alertmanager, grafana). Each service gets a traefik router on the
# `websecure` entrypoint for `<service_name>-<component>.<domain>`.
# List entries carry their own inner double quotes, so the outer single quotes
# must be kept.
prometheus_docker:
  networks:
    - name: back-tier
      external: true
    - name: front-tier
      external: true
  volumes:
    - name: "{{ prometheus_id }}-data"
    - name: "{{ alertmanager_id }}-data"
    - name: "{{ grafana_id }}-data"
  services:
    # --- prometheus ---
    - name: "{{ prometheus_id }}"
      image_name: "prom/prometheus"
      image_version: "{{ prometheus_version }}"
      labels:
        - '"traefik.enable=true"'
        - '"traefik.http.routers.{{ prometheus_id }}.service={{ prometheus_id }}"'
        - '"traefik.http.routers.{{ prometheus_id }}.rule=Host(`{{ service_name }}-prometheus.{{ domain }}`)"'
        - '"traefik.http.routers.{{ prometheus_id }}.entrypoints=websecure"'
        - '"traefik.http.routers.{{ prometheus_id }}.tls=true"'
        - '"traefik.http.routers.{{ prometheus_id }}.tls.certresolver=letsencrypt"'
        - '"traefik.http.services.{{ prometheus_id }}.loadbalancer.server.port={{ service_port_prometheus }}"'
      command:
        - '"--config.file=/etc/prometheus/prometheus.yml"'
        - '"--storage.tsdb.path=/prometheus"'
        - '"--web.console.libraries=/usr/share/prometheus/console_libraries"'
        - '"--web.console.templates=/usr/share/prometheus/consoles"'
        - '"--web.external-url={{ http_s}}://{{ service_name }}-prometheus.{{ domain }}"'
        - '"--web.enable-lifecycle"'
        - '"--storage.tsdb.retention.time=30w"'
      volumes:
        # configuration is mounted read-only, data lives in the named volume
        - '"./config/prometheus/:/etc/prometheus/:ro"'
        - '"{{ prometheus_id }}-data:/prometheus"'
      networks:
        - '"back-tier"'
        - '"front-tier"'
      extra_hosts: "{{ prometheus_extra_hosts | default([]) }}"

    # --- alertmanager ---
    - name: "{{ alertmanager_id }}"
      image_name: "prom/alertmanager"
      image_version: "{{ alertmanager_version }}"
      labels:
        - '"traefik.enable=true"'
        - '"traefik.http.routers.{{ alertmanager_id }}.service={{ alertmanager_id }}"'
        - '"traefik.http.routers.{{ alertmanager_id }}.rule=Host(`{{ service_name }}-alertmanager.{{ domain }}`)"'
        - '"traefik.http.routers.{{ alertmanager_id }}.entrypoints=websecure"'
        - '"traefik.http.routers.{{ alertmanager_id }}.tls=true"'
        - '"traefik.http.routers.{{ alertmanager_id }}.tls.certresolver=letsencrypt"'
        - '"traefik.http.services.{{ alertmanager_id }}.loadbalancer.server.port={{ service_port_alertmanager }}"'
      command:
        - '"--config.file=/etc/alertmanager/config.yml"'
        - '"--storage.path=/alertmanager"'
        - '"--web.external-url={{ http_s}}://{{ service_name }}-alertmanager.{{ domain }}"'
      environment:
        # NOTE(review): LS_JAVA_OPTS looks like a leftover from a Logstash
        # service definition - confirm whether alertmanager needs it at all.
        - 'LS_JAVA_OPTS: "-Xmx1G -Xms1G"'
      volumes:
        - '"./config/alertmanager/:/etc/alertmanager/:ro"'
        - '"{{ alertmanager_id }}-data:/alertmanager"'
      networks:
        - '"back-tier"'
        - '"front-tier"'

    # --- grafana ---
    - name: "{{ grafana_id }}"
      image_name: "grafana/grafana"
      image_version: "{{ grafana_version }}"
      user: '"472"'
      labels:
        - '"traefik.enable=true"'
        - '"traefik.http.routers.{{ grafana_id }}.service={{ grafana_id }}"'
        - '"traefik.http.routers.{{ grafana_id }}.rule=Host(`{{ service_name }}-grafana.{{ domain }}`)"'
        - '"traefik.http.routers.{{ grafana_id }}.entrypoints=websecure"'
        - '"traefik.http.routers.{{ grafana_id }}.tls=true"'
        - '"traefik.http.routers.{{ grafana_id }}.tls.certresolver=letsencrypt"'
        - '"traefik.http.services.{{ grafana_id }}.loadbalancer.server.port={{ service_port_grafana }}"'
      volumes:
        - '"./config/grafana/provisioning/:/etc/grafana/provisioning/"'
        - '"./config/grafana/conf/defaults.ini:/usr/share/grafana/conf/defaults.ini"'
        - '"{{ grafana_id }}-data:/var/lib/grafana"'
      networks:
        - '"back-tier"'
        - '"front-tier"'
      env_file:
        - '"./config/grafana/config.monitoring"'
|
||||
@ -0,0 +1 @@
|
||||
---
|
||||
@ -0,0 +1 @@
|
||||
---
|
||||
@ -0,0 +1,184 @@
|
||||
---
|
||||
|
||||
### tags:
|
||||
### update_config
|
||||
|
||||
# Post the rendered "deploy start" template to the mattermost webhook.
# Delegated to 127.0.0.1 without privilege escalation; only runs when
# send_status_messages is truthy.
# (Task name typo "messsge" corrected.)
- name: "Send mattermost message"
  uri:
    url: "{{ mattermost_hook_smardigo }}"
    method: POST
    body: "{{ lookup('template','mattermost-deploy-start.json.j2') }}"
    body_format: json
    headers:
      Content-Type: "application/json"
  delegate_to: 127.0.0.1
  become: false
  when:
    - send_status_messages
|
||||
|
||||
# Query the Hetzner Cloud server list (hcloud_server_info) from 127.0.0.1 and
# keep the module result in hetzner_server_infos.
- name: 'Gather current server infos'
  become: false
  delegate_to: 127.0.0.1
  hcloud_server_info:
    api_token: '{{ hetzner_authentication_token }}'
  register: hetzner_server_infos
|
||||
|
||||
# Expose the `hcloud_server_info` list of the previous result as a fact.
- name: 'Save current server infos as variable (fact)'
  become: false
  delegate_to: 127.0.0.1
  set_fact:
    hetzner_server_infos_json: '{{ hetzner_server_infos.hcloud_server_info }}'
|
||||
|
||||
# Walk the server list and store the IPv4 address of the entry whose name
# equals the current inventory hostname as stage_server_ip.
- name: 'Read ip for {{ inventory_hostname }}'
  become: false
  delegate_to: 127.0.0.1
  with_items: '{{ hetzner_server_infos_json }}'
  when: item.name == inventory_hostname
  set_fact:
    stage_server_ip: '{{ item.ipv4_address }}'
|
||||
|
||||
# Hand the `<service_name>-prometheus` record (data = stage server IP) to the
# `domain` tasks of the `_digitalocean` role.
- name: 'Setup DNS configuration for {{ service_name }} prometheus'
  vars:
    record_name: '{{ service_name }}-prometheus'
    record_data: '{{ stage_server_ip }}'
  include_role:
    name: _digitalocean
    tasks_from: domain
|
||||
|
||||
# Hand the `<service_name>-grafana` record (data = stage server IP) to the
# `domain` tasks of the `_digitalocean` role.
- name: 'Setup DNS configuration for {{ service_name }} grafana'
  vars:
    record_name: '{{ service_name }}-grafana'
    record_data: '{{ stage_server_ip }}'
  include_role:
    name: _digitalocean
    tasks_from: domain
|
||||
|
||||
# Hand the `<service_name>-alertmanager` record (data = stage server IP) to
# the `domain` tasks of the `_digitalocean` role.
- name: 'Setup DNS configuration for {{ service_name }} alertmanager'
  vars:
    record_name: '{{ service_name }}-alertmanager'
    record_data: '{{ stage_server_ip }}'
  include_role:
    name: _digitalocean
    tasks_from: domain
|
||||
|
||||
# Run the `networks` tasks of the `_docker` role.
- name: 'Check docker networks'
  include_role:
    tasks_from: networks
    name: _docker
|
||||
|
||||
# Stat the service's docker-compose.yml on the target; the result is stored in
# check_docker_compose_file.
- name: 'Check if {{ service_name }}/docker-compose.yml exists'
  tags:
    - update_config
  stat:
    path: "{{ service_base_path }}/{{ service_name }}/docker-compose.yml"
  register: check_docker_compose_file
|
||||
|
||||
# Bring the compose stack down before it is redeployed.
# - Skipped when no docker-compose.yml was found by the preceding stat task.
# - Errors are ignored so that a failing `down` does not abort the play.
# `command` is used instead of `shell` because no shell features are needed.
- name: "Stop {{ service_name }}"
  command: docker-compose down
  args:
    chdir: '{{ service_base_path }}/{{ service_name }}'
  when: check_docker_compose_file.stat.exists
  ignore_errors: true
|
||||
|
||||
# Run the `configs` tasks of the `_deploy` role for the "prometheus" config
# set, passing target path, ownership and the compose description
# (prometheus_docker).
- name: 'Deploy service configuration for {{ service_name }}'
  tags:
    - update_config
  vars:
    current_config: 'prometheus'
    current_docker: '{{ prometheus_docker }}'
    current_base_path: '{{ service_base_path }}'
    current_destination: '{{ service_name }}'
    current_owner: '{{ docker_owner }}'
    current_group: '{{ docker_group }}'
  include_role:
    name: _deploy
    tasks_from: configs
|
||||
|
||||
# Restart the stack: `down` followed by `up -d`, run as one two-line shell
# script inside the service directory.
- name: 'Start {{ service_name }}'
  tags:
    - update_config
  args:
    chdir: "{{ service_base_path }}/{{ service_name }}"
  shell: |
    docker-compose down
    docker-compose up -d
|
||||
|
||||
# Pass three landing-page entries (prometheus, grafana, alertmanager) with
# their URL, deployed version and timestamp to the `caddy_landing_page` tasks
# of the `_deploy` role.
- name: 'Update landing page for {{ service_name }}'
  vars:
    current_services:
      - current_name: '{{ service_prefix }}prometheus'
        current_url: '{{ http_s}}://{{ service_name }}-prometheus.{{ domain }}'
        current_version: '{{ prometheus_version }}'
        current_date: '{{ ansible_date_time.iso8601 }}'
      - current_name: '{{ service_prefix }}grafana'
        current_url: '{{ http_s }}://{{ service_name }}-grafana.{{ domain }}'
        current_version: '{{ grafana_version }}'
        current_date: '{{ ansible_date_time.iso8601 }}'
      - current_name: '{{ service_prefix }}alertmanager'
        current_url: '{{ http_s }}://{{ service_name }}-alertmanager.{{ domain }}'
        current_version: '{{ alertmanager_version }}'
        current_date: '{{ ansible_date_time.iso8601 }}'
  include_role:
    name: _deploy
    tasks_from: caddy_landing_page
|
||||
|
||||
# Poll grafana's admin stats endpoint with basic auth until it answers 200:
# up to 10 retries, 60 seconds apart. The response is kept in grafana_stats.
- name: 'Wait for {{ http_s }}://{{ service_name }}-grafana.{{ domain }}'
  register: grafana_stats
  until: grafana_stats.status == 200
  retries: 10
  delay: 60
  uri:
    method: GET
    url: '{{ http_s }}://{{ service_name }}-grafana.{{ domain }}/api/admin/stats'
    url_username: '{{ grafana_admin_username }}'
    url_password: '{{ grafana_admin_password }}'
    force_basic_auth: true
    status_code: 200
    return_content: true
|
||||
|
||||
# Create the two grafana accounts through the admin users endpoint.
# Only runs when the stats gathered earlier report exactly one user.
#
# The request body is given as a mapping and serialized by the uri module
# (body_format: json) instead of being assembled by string interpolation, so
# quotes or backslashes in a name or password can no longer produce invalid
# JSON.
# no_log keeps the looped items, which contain passwords, out of the task
# output.
- name: Create grafana users
  uri:
    url: "{{ http_s }}://{{ service_name }}-grafana.{{ domain }}/api/admin/users"
    url_username: "{{ grafana_admin_username }}"
    url_password: "{{ grafana_admin_password }}"
    force_basic_auth: true
    method: POST
    status_code: 200
    body_format: json
    body:
      name: "{{ item.name }}"
      email: "{{ item.email }}"
      login: "{{ item.login }}"
      password: "{{ item.password }}"
    headers:
      Content-Type: application/json
  loop:
    - name: "{{ grafana_user_smardigo_login }}"
      email: "smardigo@arxes-tolina.de"
      login: "{{ grafana_user_smardigo_login }}"
      password: "{{ grafana_user_smardigo_password }}"
    - name: "{{ grafana_user_guest_login }}"
      email: "grafana-guest@arxes-tolina.de"
      login: "{{ grafana_user_guest_login }}"
      password: "{{ grafana_user_guest_password }}"
  when: grafana_stats.json.users == 1
  no_log: true
|
||||
|
||||
# Post the rendered "deploy end" template to the mattermost webhook.
# Delegated to 127.0.0.1 without privilege escalation; only runs when
# send_status_messages is truthy.
# (Task name typo "messsge" corrected.)
- name: "Send mattermost message"
  uri:
    url: "{{ mattermost_hook_smardigo }}"
    method: POST
    body: "{{ lookup('template','mattermost-deploy-end.json.j2') }}"
    body_format: json
    headers:
      Content-Type: "application/json"
  delegate_to: 127.0.0.1
  become: false
  when:
    - send_status_messages
|
||||
@ -0,0 +1,16 @@
|
||||
{% for database in databases %}
|
||||
{
|
||||
name: "{{ service_name }}-postgres-exporter-{{ database.name }}",
|
||||
image_name: "wrouesnel/postgres_exporter",
|
||||
image_version: "{{ postgres_exporter_version }}",
|
||||
environment: [
|
||||
'DATA_SOURCE_URI: "{{ database.uri }}"',
|
||||
"{{ database.user }}",
|
||||
"{{ database.pass }}",
|
||||
],
|
||||
networks: [
|
||||
'"back-tier"',
|
||||
'"front-tier"',
|
||||
],
|
||||
}
|
||||
{% endfor %}
|
||||
@ -0,0 +1 @@
|
||||
---
|
||||
@ -1,20 +1,27 @@
|
||||
[hcloud]
|
||||
[connect]
|
||||
dev-connect-01
|
||||
dev-connect-02
|
||||
dev-connect-03
|
||||
|
||||
[docker_registry]
|
||||
dev-docker-registry-01
|
||||
|
||||
[elastic]
|
||||
dev-elastic-stack-01
|
||||
dev-elastic-stack-02
|
||||
dev-elastic-stack-03
|
||||
dev-prometheus-01
|
||||
|
||||
[docker_registry]
|
||||
dev-docker-registry-01
|
||||
|
||||
[prometheus]
|
||||
dev-prometheus-01
|
||||
|
||||
[stage_dev:children]
|
||||
hcloud
|
||||
connect
|
||||
docker_registry
|
||||
elastic
|
||||
prometheus
|
||||
|
||||
[all:children]
|
||||
stage_dev
|
||||
|
||||
[hcloud:children]
|
||||
stage_dev
|
||||
|
||||
@ -0,0 +1,20 @@
|
||||
# Alertmanager configuration (rendered by Ansible before use).
# Routing: the default receiver is 'smardigo'; alerts labelled
# project=smardigo with env equal to the current stage are routed to the same
# receiver explicitly.
route:
  receiver: smardigo  # default
  routes:
    - receiver: 'smardigo'
      match:
        project: 'smardigo'
        env: '{{ stage }}'

# The single receiver posts through the slack-compatible webhook.
receivers:
  - name: 'smardigo'
    slack_configs:
      - api_url: '{{ mattermost_hook_smardigo }}'
        channel: '{{ alertmanager_channel_smardigo }}'
        username: 'prometheus'
        send_resolved: true
        # title/text are escaped so the rendered file contains literal
        # Go-template delimiters for alertmanager to evaluate.
        title: '{{ '{{' }} template "custom_title" . {{ '}}' }}'
        text: '{{ '{{' }} template "custom_slack_message" . {{ '}}' }}'

# Notification templates providing custom_title / custom_slack_message.
templates:
  - /etc/alertmanager/templates/notifications.tmpl
|
||||
@ -0,0 +1,18 @@
|
||||
{{/* __single_message_title: renders "<alertname> @ <identifier annotation>" for every firing alert, followed by the same for every resolved alert. */}}
{{ define "__single_message_title" }}{{ range .Alerts.Firing }}{{ .Labels.alertname }} @ {{ .Annotations.identifier }}{{ end }}{{ range .Alerts.Resolved }}{{ .Labels.alertname }} @ {{ .Annotations.identifier }}{{ end }}{{ end }}
|
||||
|
||||
{{/* custom_title: renders "[STATUS]" in upper case, with ":<number of firing alerts>" appended while the status is "firing". The single-message title is added only when the notification holds exactly one alert (one firing and none resolved, or none firing and one resolved). */}}
{{ define "custom_title" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ if or (and (eq (len .Alerts.Firing) 1) (eq (len .Alerts.Resolved) 0)) (and (eq (len .Alerts.Firing) 0) (eq (len .Alerts.Resolved) 1)) }}{{ template "__single_message_title" . }}{{ end }}{{ end }}
|
||||
|
||||
{{/* custom_slack_message: message body. With exactly one alert (one firing and none resolved, or the reverse) it prints that alert's description annotation. Otherwise it prints a bullet list of summary annotations under "*Alerts Firing:*" and under "*Alerts Resolved:*", each list only when it is non-empty. */}}
{{ define "custom_slack_message" }}
|
||||
{{ if or (and (eq (len .Alerts.Firing) 1) (eq (len .Alerts.Resolved) 0)) (and (eq (len .Alerts.Firing) 0) (eq (len .Alerts.Resolved) 1)) }}
|
||||
{{ range .Alerts.Firing }}{{ .Annotations.description }}{{ end }}{{ range .Alerts.Resolved }}{{ .Annotations.description }}{{ end }}
|
||||
{{ else }}
|
||||
{{ if gt (len .Alerts.Firing) 0 }}
|
||||
*Alerts Firing:*
|
||||
{{ range .Alerts.Firing }}- {{ .Annotations.summary }}
|
||||
{{ end }}{{ end }}
|
||||
{{ if gt (len .Alerts.Resolved) 0 }}
|
||||
*Alerts Resolved:*
|
||||
{{ range .Alerts.Resolved }}- {{ .Annotations.summary }}
|
||||
{{ end }}{{ end }}
|
||||
{{ end }}
|
||||
{{ end }}
|
||||
@ -0,0 +1,12 @@
|
||||
# Blackbox exporter probe modules.
modules:
  # HTTP GET probe that succeeds only on status code 200.
  http_200:
    prober: http
    timeout: 5s
    http:
      method: GET
      valid_status_codes:
        - 200
      preferred_ip_protocol: "ip4"
      # Neither the presence nor the absence of TLS fails the probe.
      fail_if_ssl: false
      fail_if_not_ssl: false
      tls_config:
        # Certificates are verified.
        insecure_skip_verify: false
|
||||
@ -0,0 +1,2 @@
|
||||
GF_SECURITY_ADMIN_PASSWORD={{ grafana_admin_password }}
|
||||
GF_USERS_ALLOW_SIGN_UP=false
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,11 @@
|
||||
# Grafana dashboard provisioning: a single file-based provider.
apiVersion: 1

providers:
  - name: 'Prometheus'
    type: file
    orgId: 1
    # Empty folder name.
    folder: ''
    editable: true
    disableDeletion: false
    options:
      # Directory holding the dashboard files.
      path: /etc/grafana/provisioning/dashboards
|
||||
@ -0,0 +1,50 @@
|
||||
# Grafana datasource provisioning (Jinja template), config file version 1.
apiVersion: 1

# Datasources that should be deleted from the database.
deleteDatasources:
  - name: Prometheus
    orgId: 1

# Datasources to insert or update, depending on what is already available
# in the database.
datasources:
  - name: Prometheus  # <string, required> name of the datasource
    type: prometheus  # <string, required> datasource type
    access: proxy  # <string, required> access mode: direct or proxy
    orgId: 1  # <int> org id, defaults to 1 if not specified
    # <string> url
    url: 'http://{{ service_name }}-prometheus:9090'
    # <string> database password / user / name, if used
    password: null
    user: null
    database: null
    # <bool> enable/disable basic auth, followed by its username and password.
    # NOTE(review): these credentials are committed in clear text and look
    # like leftovers of an example file; confirm they are needed and move
    # them to the vault if so.
    basicAuth: true
    basicAuthUser: admin
    basicAuthPassword: foobar
    # <bool> enable/disable with credentials headers
    withCredentials: null
    # <bool> mark as default datasource, at most one per org
    isDefault: true
    # <map> fields that are converted to json and stored in json_data
    jsonData:
      graphiteVersion: "1.1"
      tlsAuth: false
      tlsAuthWithCACert: false
    # <string> json object of data that will be encrypted.
    # NOTE(review): the values below are placeholders; tlsAuth is false above.
    secureJsonData:
      tlsCACert: "..."
      tlsClientCert: "..."
      tlsClientKey: "..."
    version: 1
    # <bool> allow users to edit datasources from the UI
    editable: true
|
||||
@ -0,0 +1,252 @@
|
||||
groups:
|
||||
- name: tolina
|
||||
rules:
|
||||
- alert: too_many_connections
|
||||
expr: pg_stat_database_numbackends > 20
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.datname {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.datname {{ '}}' }}> has too many open db connections"
|
||||
description: "Too many connections for more than 2 minutes."
|
||||
|
||||
- alert: service_down
|
||||
expr: up == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> down"
|
||||
description: "Down for more than 2 minutes."
|
||||
|
||||
# Fires when the non-idle CPU share, averaged per instance/env/job, stays
# above 90 percent for two minutes. The grouping clause lists every label
# exactly once.
- alert: high_load
  expr: 100 - (avg by (instance, env, job) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"}[30s])) * 100) > 90
  for: 2m
  labels:
    severity: critical
  annotations:
    identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
    summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
    description: "High load for more than 2 minutes."
|
||||
|
||||
- alert: apt_upgrades_pending
|
||||
expr: apt_upgrades_pending > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has pending updates."
|
||||
description: "Pending updates [{{ '{{' }} $labels.origin {{ '}}' }}] / [{{ '{{' }} $labels.arch {{ '}}' }}]."
|
||||
|
||||
- alert: reboot_required
|
||||
expr: node_reboot_required == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs reboot."
|
||||
description: "Need reboot!"
|
||||
|
||||
- alert: veeam_backup_failed
|
||||
expr: veeam_backup_failed == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
|
||||
description: "Backup failed."
|
||||
|
||||
- alert: probe_ssl_certificates
|
||||
expr: probe_ssl_earliest_cert_expiry{job=~"http_check_200_secure|http_check_403_secure"} - time() < 86400 * 30
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate until next 30 days."
|
||||
description: "needs a new certificate until next 30 days."
|
||||
|
||||
- alert: probe_ssl_certificates_50
|
||||
expr: probe_ssl_earliest_cert_expiry{job="http_check_200_secure", instance="https://mail.sparkassenfinanzportal.de"} - time() < 86400 * 50
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate until next 50 days."
|
||||
description: "needs a new certificate until next 50 days."
|
||||
|
||||
- alert: disk_space
|
||||
expr: predict_linear(node_filesystem_avail_bytes{device!="/dev/loop1",device!="veeamagent",device!="/dev/veeamimage1",env="{{ stage }}",fstype!="cifs",mountpoint=~"/|/rootfs"}[1d], 24 * 3600 * 14) < 1024 * 1024 * 1024 * 5
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> disk space falls below 5GB in the next 14 days."
|
||||
description: "disk space falls below 5GB in the next 14 days."
|
||||
|
||||
- alert: software_raid_disks_active
|
||||
expr: node_md_disks_active != 2
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> software-raid disks broken."
|
||||
description: "software-raid disks broken."
|
||||
|
||||
- alert: software_raid_active
|
||||
expr: node_md_is_active != 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> raid inactive."
|
||||
description: "software-raid inactive."
|
||||
|
||||
- alert: restic_backup_failed
|
||||
expr: (restic_cmd_return_code_mount + restic_cmd_return_code_backup + restic_cmd_return_code_forget + restic_cmd_return_code_prune + restic_cmd_return_code_check + restic_cmd_return_code_stats + restic_cmd_return_code_umount + restic_backup_failed) > 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
|
||||
description: "backup failed."
|
||||
|
||||
- alert: megaraid_smart_errors
|
||||
expr: megaraid_smart_errors > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> disk(s) reporting smart errors."
|
||||
description: "megaraid smart errors."
|
||||
|
||||
- alert: megaraid_status_failed
|
||||
expr: megaraid_status_failed > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid status failed."
|
||||
description: "megaraid status errors."
|
||||
|
||||
- alert: megaraid_other_error_count
|
||||
expr: megaraid_other_error_count > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid controller errors."
|
||||
description: "megaraid other error count."
|
||||
|
||||
- alert: megaraid_exit_status
|
||||
expr: megaraid_exit_status > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> megaraid exporter status failed."
|
||||
description: "megaraid exit status."
|
||||
|
||||
- alert: adaptec_controller_defunct_disk_drives
|
||||
expr: adaptec_controller_defunct_disk_drives > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting disk failure."
|
||||
description: "adaptec controller defunct disk drives."
|
||||
|
||||
- alert: adaptec_physical_devices_smart_failed
|
||||
expr: adaptec_physical_devices_smart_failed > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart errors."
|
||||
description: "adaptec physical devices smart failed."
|
||||
|
||||
- alert: adaptec_physical_devices_smart_warnings
|
||||
expr: adaptec_physical_devices_smart_warnings > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart warnings."
|
||||
description: "adaptec physical devices smart warnings."
|
||||
|
||||
- alert: adaptec_controller_logical_failed
|
||||
expr: adaptec_controller_logical_failed > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical failure."
|
||||
description: "adaptec controller logical failed."
|
||||
|
||||
- alert: adaptec_controller_status_failed
|
||||
expr: adaptec_controller_status_failed > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller status failure."
|
||||
description: "adaptec controller status failed."
|
||||
|
||||
- alert: adaptec_controller_temperature_status_failed
|
||||
expr: adaptec_controller_temperature_status_failed > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller temperatur status failure."
|
||||
description: "adaptec controller temperature status failed"
|
||||
|
||||
- alert: adaptec_logical_degraded
|
||||
expr: adaptec_logical_degraded > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
|
||||
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical degraded."
|
||||
description: "adaptec logical degraded."
|
||||
|
||||
# Fires when the last restic backup timestamp is more than 180 minutes
# (3 hours) old. Summary and description both state the same threshold as
# the expression.
- alert: backup_execution
  expr: (time() - restic_backup_timestamp) / 60 > 180
  for: 2m
  labels:
    severity: critical
  annotations:
    identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
    summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting latest backup execution is older then 3 hours."
    description: "latest backup is older than 3 hours."
|
||||
|
||||
# Fires when the summed duration of all restic commands exceeds 60 minutes.
# Summary and description both state the same threshold as the expression.
- alert: backup_duration
  expr: (restic_cmd_duration_backup + restic_cmd_duration_check + restic_cmd_duration_forget + restic_cmd_duration_mount + restic_cmd_duration_prune + restic_cmd_duration_stats + restic_cmd_duration_umount) / 60 > 60
  for: 2m
  labels:
    severity: warning
  annotations:
    identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
    summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting backup duration took more than 60 minutes."
    description: "backup duration took more than 60 minutes."
|
||||
@ -0,0 +1,8 @@
|
||||
# Blackbox exporter probe targets of the connect project, one list entry per
# item of blackbox_exporter_targets_connect (empty when the variable is unset).
# Targets are quoted so that a value with YAML special characters stays a string.
# NOTE(review): presumably consumed through file_sd_configs; confirm in the
# Prometheus configuration.
- targets:
{% for target in blackbox_exporter_targets_connect | default([]) %}
    - '{{ target }}'
{% endfor %}
  labels:
    job: 'blackbox-exporter'
    project: 'connect'
    env: '{{ stage }}'
|
||||
@ -0,0 +1,8 @@
|
||||
# Blackbox exporter probe targets of the smardigo project, one list entry per
# item of blackbox_exporter_targets (empty when the variable is unset).
# Targets are quoted so that a value with YAML special characters stays a string.
# NOTE(review): presumably consumed through file_sd_configs; confirm in the
# Prometheus configuration.
- targets:
{% for target in blackbox_exporter_targets | default([]) %}
    - '{{ target }}'
{% endfor %}
  labels:
    job: 'blackbox-exporter'
    project: 'smardigo'
    env: '{{ stage }}'
|
||||
@ -0,0 +1,92 @@
|
||||
# my global config
|
||||
global:
|
||||
scrape_interval: 15s # By default, scrape targets every 15 seconds.
|
||||
evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
|
||||
# scrape_timeout is set to the global default (10s).
|
||||
|
||||
# Attach these labels to any time series or alerts when communicating with
|
||||
# external systems (federation, remote storage, Alertmanager).
|
||||
external_labels:
|
||||
monitor: '{{ stage_server_name }}'
|
||||
|
||||
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
|
||||
rule_files:
|
||||
- 'alert.rules'
|
||||
|
||||
# alert
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- scheme: http
|
||||
static_configs:
|
||||
- targets:
|
||||
- "{{ service_name }}-alertmanager:9093"
|
||||
|
||||
# Scrape configurations, grouped below into applications and servers.
|
||||
# The first job scrapes Prometheus itself.
|
||||
scrape_configs:
|
||||
|
||||
##############################################
|
||||
### Applications ####
|
||||
##############################################
|
||||
|
||||
# Prometheus scrapes its own metrics endpoint. The instance label is replaced
# by '<service_prefix>prometheus.<domain>'. Templated label values are quoted
# so that they always render as YAML strings.
- job_name: 'prometheus'
  static_configs:
  - targets:
    - '{{ service_name }}-prometheus:9090'
    labels:
      env: '{{ stage }}'
      project: smardigo
  relabel_configs:
  - source_labels: [__address__]
    target_label: instance
    replacement: '{{ service_prefix }}prometheus.{{ domain }}'
|
||||
|
||||
##############################################
|
||||
|
||||
# Scrapes the connect application under /management/prometheus.
# Targets are the optional extra entries of prometheus_targets_connect plus
# one '<host>.<domain>:<monitor_port_service>' entry per host of the inventory
# group 'connect'. Every entry is quoted and followed by a comma so that the
# flow sequence stays valid YAML for any number of targets.
- job_name: 'connect'
  scheme: '{{ http_s }}'
  metrics_path: '/management/prometheus'
  static_configs:
  - targets: [
{% for target in prometheus_targets_connect | default([]) %}
      '{{ target }}',
{% endfor %}
{% for host in groups['connect'] | default([]) %}
      '{{ host }}.{{ domain }}:{{ monitor_port_service }}',
{% endfor %}
    ]
    labels:
      env: '{{ stage }}'
      project: smardigo
      application: connect
  relabel_configs:
  # Drop the port: the instance label becomes the part before the last colon.
  - source_labels: [__address__]
    regex: (.*):.*
    target_label: instance
    replacement: $1
|
||||
|
||||
##############################################
|
||||
### Servers ####
|
||||
##############################################
|
||||
|
||||
# Scrapes the node exporter of every host in the inventory group 'stage_dev'
# at '<host>.<domain>:<monitor_port_system>'. Templated scalar values are
# quoted so that they always render as YAML strings.
# NOTE(review): the group name is hard-coded to stage_dev while the env label
# uses the stage variable; confirm this template is only rendered for dev.
- job_name: 'node-exporter'
  scheme: '{{ http_s }}'
  metrics_path: '/metrics'
  static_configs:
  - targets: [
{% for host in groups['stage_dev'] | default([]) %}
      '{{ host }}.{{ domain }}:{{ monitor_port_system }}',
{% endfor %}
    ]
    labels:
      env: '{{ stage }}'
      project: smardigo
  relabel_configs:
  # Sets the job label to 'node-exporter'.
  # NOTE(review): this equals job_name already and looks redundant.
  - source_labels: [job]
    target_label: job
    replacement: 'node-exporter'
  # Drop the port: the instance label becomes the part before the last colon.
  - source_labels: [__address__]
    regex: (.*):.*
    target_label: instance
    replacement: $1
|
||||
Loading…
Reference in New Issue