feat: setup prometheus stack

master
Sven Ketelsen 5 years ago
parent c10d556038
commit 57ff124d00

@ -106,3 +106,12 @@ hetzner_ssh_keys:
#mattermost_hook_smardigo: "< see vault >" #mattermost_hook_smardigo: "< see vault >"
#hetzner_authentication_token: "< see vault >" #hetzner_authentication_token: "< see vault >"
#digitalocean_authentication_token: "< see vault >" #digitalocean_authentication_token: "< see vault >"
#grafana_admin_username: "< see vault >"
#grafana_admin_password: "< see vault >"
#grafana_signing_secret: "< see vault >"
#grafana_user_smardigo_login: "< see vault >"
#grafana_user_smardigo_password: "< see vault >"
#grafana_user_guest_login: "< see vault >"
#grafana_user_guest_password: "< see vault >"

@ -1,30 +1,43 @@
$ANSIBLE_VAULT;1.1;AES256 $ANSIBLE_VAULT;1.1;AES256
37303936393666363130613561666665623364303361613938633232363532313830316231353935 64643166313265363734313932666666643238333366393865343132313835666433326366653337
3335653838653863326334623161336336653435373664310a363561353430356166373766393836 6135633264613662366233323835663034373761663864350a663161316266653238323332656336
31303463343336613665326636643837633362636335343830396438363634656639376463353965 35323166323062323465623933653538356334666230616339313533613431613234653136386230
6131343431653439340a363930643966386237323763613566393235303437376132393865323230 3764333134323538310a353530616532613365326131376664386335336161326638663335326530
63616136386133326131656565306233343831343030313935663764353330653231666533333238 38383166343939316537396332313664313064613561393036353164626566343136623835623237
37343938363431333936613065613633396231306365346435623362653437326536663135333532 62383834376661643537356335646462323962633432336238333033343666326230326639363364
38303838663865623737663866633839333835363864616330626335323338626331316263653233 37666261343733373839613362346166666231663463616436363838626134663861616566663137
39343965666461653538303632636564336338313162663833623365396465336232366236383034 64613666616336303134393161323262386264666232336132376534316461333764363037376531
61393839616164386565373930623338323130626330316565383338623634663338366233613963 63363433363233653839616132373038623436333866326338343130343734323662323137643033
32306431383136346263633364626265363737353363396131313461656139393239306537343435 66313664636565323333646238363339653235346362393435323032656536373838643765313562
34643830373965303339373831393465366565663936663061663434633131303033333436346566 61343437353165666464316135366266623263383033346534666538383566303162393430393761
31376330613939356534613534313335653464616436393137396165326262636538656137326532 39663732636164346633396230336538376236663330323363626132383964323530336338363836
63396138383364646339353539363230306461373732333037663862306161333966313462356363 37333837343836616231643730626134303031376130393431646464646438336334343565326463
36656639346238313839623232373738376530633361373565353063323065626634306532626539 65623237373166303662336362663636333964643866643638666132303862626264353064616163
37393038633761396539353233666563316535323965363233656134393365356339626565656134 39333130396237343431336132383238343535363834356462393430363162643635356363383238
63323864653566646531393830396337353139653831343866303039313631613334313431643161 64626434616365346238626236366232333333366431336436363863316563313462366538366436
39383264646566636538626433333937333230383564316437353464613862316532343564316530 33373264623837346232303131653464376534646438643332626566613735653439646661383536
64623935383037326563613533313361333435326166343339616461386437356238376263356334 38346263346336346137616337363435666466343836656638653638646133303733363365323934
33373166613033626130333962366464663262393134623838633937653837653332393061626637 64393631303163633061393530623535313961313737643638626665303363366439306366373064
66653730396436313339616562626230363231303136333235663534626266613831646631633530 65333563636631373931313837653738356234393036323165663036653565386663313938373430
39356263346231373463373761626430376431633135353939656664613632633965323838633362 34656233326230356262306464323563393066646262613933653032333864326261626330643333
65633335643866633530346236653435343565663936376266663862363130303032323436646133 33636464396263323563626335306432373764353265373833653230333837653363333761666136
66643833653363323935353636343430346561346262383436663838636536386638356438663430 34653035393262623134353361323230323238653034316466663663346462353337613939313238
65363262396339323530303663333730313836346565623430633232366138376261393831643137 61393930373037313266663563386632343262373061333838646531373666383535323065646639
37373734333538313566306631373233353364656438323435373265306531396534656265633532 66326539393061373465613130643761346330623866633263336532663966366339323665323363
31643831353931393139663861346234333233353566333435373338393166376333343235303034 39386539633163653233356439303635646666303662393235316238393934633066373866623230
37376565643162303531396566313531643933376663343663636230376338666565323263666539 34316366656130393738353637626166383563343233383163383639373539626234363265356532
65373832373931393265333432313232633536646331633833613561366532363239326538333565 65303739393637656433656164373934613237336436326630393535633637323865386531646638
3134 63316365623639373332323366373461393766633662396562306534306466653162633131623131
65306334656535383137343830323966346337323363343663326438613562643466643666386537
64326334356561653231346433396439666237626336666239336463333536376130373866343736
32396233333161313230656461396361626435666664616462363036386636396364636364323966
31656130323264363862656461616562613934636636373535343333666565626134376266613937
66393266613635313030356263366235323139663439303861356665333163386334646339613933
62636338343237376630376364323763383562383462613366393738663237643931636161383631
30623834383839613531616435613833636662313664323166363935396231643430376330396431
36366235393933613362303466343433643731363835343862346131343836376132316536633034
32333263313031313464343562633835323663363965373465633433386566313832346639623232
63393832643937396130613638623231663137303832616266326461636164393565336537656437
62393866636233343766633863643532396138636638326531326430613634353564386633343265
33613930356433356139623830326165323632633039333837623136376661303736356661343364
3736353162636662646162333934306562626662633931386565

@ -0,0 +1,5 @@
---
# Stage identifier; the Alertmanager route template matches alerts whose
# `env` label equals this value.
stage: "dev"
# Channel name handed to the Alertmanager "smardigo" receiver.
alertmanager_channel_smardigo: "#monitoring-qa"

@ -2,7 +2,7 @@
- name: 'apply setup to {{ host | default("all") }}' - name: 'apply setup to {{ host | default("all") }}'
hosts: '{{ host | default("all") }}' hosts: '{{ host | default("all") }}'
serial: "{{ serial_number | default(1) }}" serial: "{{ serial_number | default(5) }}"
gather_facts: no gather_facts: no
become: no become: no

@ -8,20 +8,28 @@
authorization: Bearer {{ digitalocean_authentication_token }} authorization: Bearer {{ digitalocean_authentication_token }}
return_content: yes return_content: yes
register: domain_records_response register: domain_records_response
delegate_to: 127.0.0.1
become: false
- name: Save DNS entry as variable (fact) - name: Save DNS entry as variable (fact)
set_fact: set_fact:
domain_records_response_json: "{{ domain_records_response.json }}" domain_records_response_json: "{{ domain_records_response.json }}"
delegate_to: 127.0.0.1
become: false
- name: Parse DNS entry for {{ record_name }}.{{ domain }} - name: Parse DNS entry for {{ record_name }}.{{ domain }}
set_fact: set_fact:
domain_record: "{{ domain_records_response_json.domain_records | json_query(jmesquery) | first | default({'name': '-', 'ip': '-'}) }}" domain_record: "{{ domain_records_response_json.domain_records | json_query(jmesquery) | first | default({'name': '-', 'ip': '-'}) }}"
vars: vars:
jmesquery: '[*].{id: id, name: name, ip: data}' jmesquery: '[*].{id: id, name: name, ip: data}'
delegate_to: 127.0.0.1
become: false
- name: Print DNS entry for {{ record_name }}.{{ domain }} - name: Print DNS entry for {{ record_name }}.{{ domain }}
debug: debug:
msg: "{{ domain_record }}" msg: "{{ domain_record }}"
delegate_to: 127.0.0.1
become: false
- name: Delete DNS entry for <{{ record_name }}> if necessary - name: Delete DNS entry for <{{ record_name }}> if necessary
uri: uri:
@ -34,6 +42,8 @@
when: when:
domain_record.ip != '-' domain_record.ip != '-'
and record_data != domain_record.ip and record_data != domain_record.ip
delegate_to: 127.0.0.1
become: false
- name: Create DNS entry for <{{ record_name }}> if necessary - name: Create DNS entry for <{{ record_name }}> if necessary
uri: uri:
@ -55,3 +65,5 @@
domain_record.ip == '-' domain_record.ip == '-'
or record_data != domain_record.ip or record_data != domain_record.ip
or record_name != domain_record.name or record_name != domain_record.name
delegate_to: 127.0.0.1
become: false

@ -0,0 +1,133 @@
---
# Image and tag of the connect application container.
connect_image_name: 'docker.arxes-tolina.de/smardigo/connect-whitelabel-app'
# NOTE(review): 'latest' is an unpinned tag, so the role's `docker-compose pull`
# fetches whatever the registry currently serves under it.
connect_version: 'latest'
# NOTE(review): plain-text default credentials; presumably meant to be
# overridden per environment (e.g. from the vault) - confirm.
connect_admin_username: "connect-admin"
connect_admin_password: "connect-admin"
# PostgreSQL image tag, database name and admin credentials used by both the
# postgres container and the connect datasource settings.
connect_postgres_version: "12"
connect_postgres_database: "connect-postgres"
connect_postgres_admin_username: "connect-postgres-admin"
connect_postgres_admin_password: "connect-postgres-admin"
# Base URLs passed to the application as MAIL_PROPERTIES_BASE_URL(_EXTERN).
connect_mail_properties_base_url: "{{ http_s }}://{{ service_url }}"
connect_mail_properties_base_url_extern: "{{ http_s }}://{{ service_url }}"
# Container / router identifiers derived from the service name.
connect_id: "{{ service_name }}-connect"
connect_postgres_id: "{{ service_name }}-postgres-connect"
# Traefik labels attached to the connect container. Every entry carries an
# inner pair of double quotes that is part of the string value itself.
connect_labels:
  - '"traefik.enable=true"'
  # Application router on the "websecure" entrypoint.
  - '"traefik.http.routers.{{ connect_id }}.service={{ connect_id }}"'
  - '"traefik.http.routers.{{ connect_id }}.rule=Host(`{{ service_url }}`)"'
  - '"traefik.http.routers.{{ connect_id }}.entrypoints=websecure"'
  - '"traefik.http.routers.{{ connect_id }}.tls=true"'
  - '"traefik.http.routers.{{ connect_id }}.tls.certresolver=letsencrypt"'
  - '"traefik.http.services.{{ connect_id }}.loadbalancer.server.port={{ service_port }}"'
  # Management router on the "admin-service" entrypoint, with a CORS middleware.
  - '"traefik.http.routers.{{ connect_id }}-admin.service={{ connect_id }}-admin"'
  - '"traefik.http.routers.{{ connect_id }}-admin.rule=Host(`{{ service_url }}`)"'
  - '"traefik.http.routers.{{ connect_id }}-admin.entrypoints=admin-service"'
  - '"traefik.http.routers.{{ connect_id }}-admin.tls=true"'
  - '"traefik.http.routers.{{ connect_id }}-admin.tls.certresolver=letsencrypt"'
  - '"traefik.http.routers.{{ connect_id }}-admin.middlewares={{ connect_id }}-admin-cors"'
  - '"traefik.http.middlewares.{{ connect_id }}-admin-cors.headers.accesscontrolallowmethods=GET,OPTIONS"'
  - '"traefik.http.middlewares.{{ connect_id }}-admin-cors.headers.accesscontrolalloworigin=*"'
  - '"traefik.http.middlewares.{{ connect_id }}-admin-cors.headers.accesscontrolallowheaders=SMA_USER"'
  - '"traefik.http.services.{{ connect_id }}-admin.loadbalancer.server.port={{ management_port }}"'
  # Monitoring router on the "admin-system" entrypoint; targets the
  # "node-exporter" service rather than the connect container.
  - '"traefik.http.routers.{{ connect_id }}-monitor.service=node-exporter"'
  - '"traefik.http.routers.{{ connect_id }}-monitor.rule=Host(`{{ service_url }}`)"'
  - '"traefik.http.routers.{{ connect_id }}-monitor.entrypoints=admin-system"'
  - '"traefik.http.routers.{{ connect_id }}-monitor.tls=true"'
  - '"traefik.http.routers.{{ connect_id }}-monitor.tls.certresolver=letsencrypt"'
# Compose description of the connect stack (application + its PostgreSQL).
# The role hands this structure to the _deploy role as `current_docker`.
connect_docker:
  # Both networks are declared external, i.e. not created by this stack.
  networks:
    - name: back-tier
      external: true
    - name: front-tier
      external: true
  volumes:
    - name: "{{ connect_postgres_id }}-data"
  services:
    # --- connect application -------------------------------------------------
    - name: "{{ connect_id }}"
      image_name: "{{ connect_image_name }}"
      image_version: "{{ connect_version }}"
      # Base labels plus optional caller-supplied extras.
      labels: "{{ connect_labels + ( connect_labels_additional | default([])) }}"
      restart: "{{ connect_service_restart | default('always') }}"
      # Each entry is a ready-made `KEY: "value"` string.
      environment:
        - "ADMIN_LOGIN: \"{{ connect_admin_username }}\""
        - "ADMIN_PASSWORD: \"{{ connect_admin_password }}\""
        - "DATASOURCE_URL: \"jdbc:postgresql://{{ connect_postgres_id }}:{{ service_port_postgres }}/{{ connect_postgres_database }}\""
        - "DATASOURCE_USERNAME: \"{{ connect_postgres_admin_username }}\""
        - "DATASOURCE_PASSWORD: \"{{ connect_postgres_admin_password }}\""
        - "MAIL_PROTOCOL: \"{{ connect_mail_protocol | default('smtp') }}\""
        - "MAIL_HOST: \"{{ connect_mail_host | default('smtp.tolina.local') }}\""
        - "MAIL_PORT: \"{{ connect_mail_port | default('25') }}\""
        - "MAIL_USER: \"{{ connect_mail_user | default('') }}\""
        - "MAIL_PASSWORD: \"{{ connect_mail_password | default('') }}\""
        - "MAIL_PROPERTIES_SIMULATION: \"{{ connect_mail_properties_simulation | default('true') }}\""
        - "MAIL_PROPERTIES_BASE_URL: \"{{ connect_mail_properties_base_url }}\""
        - "MAIL_PROPERTIES_BASE_URL_EXTERN: \"{{ connect_mail_properties_base_url_extern }}\""
        - "MAIL_PROPERTIES_SENDER: \"{{ connect_mail_properties_sender | default('noreply-connect@arxes-tolina.de') }}\""
        - "MAIL_PROPERTIES_SENDER_ALIAS: \"{{ connect_mail_properties_sender_alias | default('noreply-connect') }}\""
        - "AUTH_MODULE: \"{{ connect_auth_module | default('preauth') }}\""
        - "OIDC_CLIENT_ID: \"{{ connect_oidc_client_id | default('oidc_config_not_found') }}\""
        - "OIDC_CLIENT_SECRET: \"{{ connect_oidc_client_secret | default('oidc_config_not_found') }}\""
        - "OIDC_REGISTRATION_ID: \"{{ connect_oidc_registration_id | default('oidc_config_not_found') }}\""
        - "OIDC_ISSUER_URI: \"{{ connect_oidc_issuer_uri | default('oidc_config_not_found') }}\""
        - "PASSWORD_CHANGE_URL: \"{{ connect_password_change_url | default('') }}\""
        - "USER_MANAGEMENT_URL: \"{{ connect_iam_user_management_url | default('') }}\""
        - "IAM_MODULE: \"{{ connect_iam_module | default('embedded') }}\""
        - "IAM_CLIENT_ENABLED: \"{{ smardigo_iam_client_enabled | default('false') }}\""
        - "EXTERNAL_IAM_SERVER_URL: \"{{ smardigo_iam_client_server_url | default('') }}\""
        - "SMA_API_TOKEN_SECRET: \"{{ connect_api_token_secret | default('') }}\""
        - "SMA_CSRF_TOKEN_NAME: \"{{ connect_csrf_token_name | default('') }}\""
        - "SMA_CSRF_TOKEN_VALUE: \"{{ connect_csrf_token_value | default('') }}\""
        - "SPRING_PROFILES_INCLUDE: \"{{ spring_profiles_include | default('swagger') }}\""
        - "RIBBON_DISPLAY_ON_ACTIVE_PROFILES: \"{{ ribbon_display_on_active_profiles | default('dev') }}\""
      networks:
        - '"back-tier"'
        - '"front-tier"'
      extra_hosts: "{{ connect_extra_hosts | default([]) }}"
    # --- PostgreSQL backing the application ------------------------------------
    - name: "{{ connect_postgres_id }}"
      image_name: "postgres"
      image_version: "{{ connect_postgres_version }}"
      environment:
        - 'POSTGRES_DB: "{{ connect_postgres_database }}"'
        - 'POSTGRES_USER: "{{ connect_postgres_admin_username }}"'
        - 'POSTGRES_PASSWORD: "{{ connect_postgres_admin_password }}"'
      volumes:
        - '"{{ connect_postgres_id }}-data:/var/lib/postgresql/data"'
      # Attached to back-tier only.
      networks:
        - '"back-tier"'
      # No ports are published unless connect_postgres_ports is supplied.
      ports: "{{ connect_postgres_ports | default([]) }}"

@ -0,0 +1,188 @@
---
### tags:
### create_users
### update_deployment
# Post the rendered "deploy start" template to the Mattermost hook. Runs on the
# control node and only when send_status_messages is truthy.
- name: "Send mattermost message"
  when: send_status_messages
  delegate_to: 127.0.0.1
  become: false
  uri:
    method: POST
    url: "{{ mattermost_hook_smardigo }}"
    headers:
      Content-Type: "application/json"
    body_format: json
    body: "{{ lookup('template','mattermost-deploy-start.json.j2') }}"
# Hand the service name and the stage server IP to the "domain" task file of
# the _digitalocean role.
- name: "Setup DNS configuration for {{ service_name }}"
  vars:
    record_name: "{{ service_name }}"
    record_data: "{{ stage_server_ip }}"
  include_role:
    name: _digitalocean
    tasks_from: domain
# Run the _digitalocean "domain" tasks once per optional public DNS entry
# (each entry provides `name` and `ip`).
- name: "Setup public DNS configuration for {{ service_name }}"
  include_role:
    name: _digitalocean
    tasks_from: domain
  vars:
    record_data: "{{ item.ip }}"
    record_name: "{{ item.name }}"
  # default([]) keeps the loop expression valid when the variable is not set;
  # `when` is evaluated per item and cannot guard the loop expression itself.
  loop: "{{ connect_public_dns_entries | default([]) }}"
  when: connect_public_dns_entries is defined
# Delegates to the "networks" task file of the _docker role.
- name: "Check docker networks"
  include_role:
    name: _docker
    tasks_from: networks

# Record whether a compose file from an earlier deployment is present.
- name: "Check if {{ service_name }}/docker-compose.yml exists"
  tags:
    - update_deployment
  stat:
    path: '{{ service_base_path }}/{{ service_name }}/docker-compose.yml'
  register: check_docker_compose_file

# Bring the existing stack down before redeploying; failures are tolerated.
- name: "Stop {{ service_name }}"
  tags:
    - update_deployment
  when: check_docker_compose_file.stat.exists
  shell: docker-compose down
  args:
    chdir: '{{ service_base_path }}/{{ service_name }}'
  ignore_errors: true
# Pass the connect compose description and target location/ownership to the
# "configs" task file of the _deploy role.
- name: "Deploy service configuration for {{ service_name }}"
  vars:
    current_config: "connect"
    current_docker: "{{ connect_docker }}"
    current_base_path: "{{ service_base_path }}"
    current_destination: "{{ service_name }}"
    current_owner: "{{ docker_owner }}"
    current_group: "{{ docker_group }}"
  include_role:
    name: _deploy
    tasks_from: configs
# Pull the images referenced by the deployed compose file.
- name: "Update {{ service_name }}"
  tags:
    - update_deployment
  shell: docker-compose pull
  args:
    chdir: '{{ service_base_path }}/{{ service_name }}'

# Start the stack detached.
- name: "Start {{ service_name }}"
  tags:
    - update_deployment
  shell: docker-compose up -d
  args:
    chdir: '{{ service_base_path }}/{{ service_name }}'
# Hand three external -> internal mappings to the "caddy_config" task file of
# the _deploy role: the application, its management port and node-exporter.
- name: "Update caddy configuration for {{ service_name }}"
  vars:
    current_service: "{{ service_name }}"
    current_services:
      - external: "{{ service_url }}"
        internal: "{{ service_name }}-connect:{{ service_port }}"
      - external: "{{ service_url }}:{{ monitor_port_service }}"
        internal: "{{ service_name }}-connect:{{ management_port }}"
      - external: "{{ service_url }}:{{ monitor_port_system }}"
        internal: "node-exporter:{{ service_port_node_exporter }}"
  include_role:
    name: _deploy
    tasks_from: caddy_config
# One caddy configuration per optional public DNS entry, mapping
# <entry name>.<domain> to the connect application port.
- name: "Update public caddy configuration for {{ service_name }}"
  include_role:
    name: _deploy
    tasks_from: caddy_config
  vars:
    current_service: "{{ service_name }}-{{ item.name }}"
    current_services:
      - external: "{{ item.name }}.{{ domain }}"
        internal: "{{ service_name }}-connect:{{ service_port }}"
  # default([]) keeps the loop expression valid when the variable is not set;
  # `when` is evaluated per item and cannot guard the loop expression itself.
  loop: "{{ connect_public_dns_entries | default([]) }}"
  when: connect_public_dns_entries is defined
# Describe the service (name, URL, version, timestamp, management URL) to the
# "caddy_landing_page" task file of the _deploy role.
- name: "Update landing page entries for {{ service_name }}"
  tags:
    - update_deployment
  vars:
    current_services:
      - current_name: "{{ service_name }}"
        current_url: "{{ http_s }}://{{ service_url }}"
        current_version: "{{ connect_version }}"
        current_date: "{{ ansible_date_time.iso8601 }}"
        management: "{{ http_s }}://{{ service_url }}:{{ monitor_port_service }}/management"
  include_role:
    name: _deploy
    tasks_from: caddy_landing_page
# One landing-page entry per optional public DNS entry. The management link
# still points at the primary service URL.
- name: "Update landing page with public entries {{ service_name }}"
  include_role:
    name: _deploy
    tasks_from: caddy_landing_page
  vars:
    current_services:
      - current_name: "{{ item.name }}"
        current_url: "{{ http_s }}://{{ item.name }}.{{ domain }}"
        current_version: "{{ connect_version }}"
        current_date: "{{ ansible_date_time.iso8601 }}"
        management: "{{ http_s }}://{{ service_url }}:{{ monitor_port_service }}/management"
  # default([]) keeps the loop expression valid when the variable is not set;
  # `when` is evaluated per item and cannot guard the loop expression itself.
  loop: "{{ connect_public_dns_entries | default([]) }}"
  when: connect_public_dns_entries is defined
  tags:
    - update_deployment
# One landing-page entry per optional extra domain entry; `item.domain` is used
# verbatim as the entry URL.
- name: "Update landing page with extra entries for {{ service_name }}"
  include_role:
    name: _deploy
    tasks_from: caddy_landing_page
  vars:
    current_services:
      - current_name: "{{ item.name }}"
        current_url: "{{ item.domain }}"
        current_version: "{{ connect_version }}"
        current_date: "{{ ansible_date_time.iso8601 }}"
        management: "{{ http_s }}://{{ service_url }}:{{ monitor_port_service }}/management"
  # default([]) keeps the loop expression valid when the variable is not set;
  # `when` is evaluated per item and cannot guard the loop expression itself.
  loop: "{{ connect_extra_domain_entries | default([]) }}"
  when: connect_extra_domain_entries is defined
  tags:
    - update_deployment
# Post the rendered "deploy end" template to the Mattermost hook. Runs on the
# control node and only when send_status_messages is truthy.
- name: "Send mattermost message"
  uri:
    url: "{{ mattermost_hook_smardigo }}"
    method: POST
    body: "{{ lookup('template','mattermost-deploy-end.json.j2') }}"
    body_format: json
    headers:
      Content-Type: "application/json"
  delegate_to: 127.0.0.1
  become: false
  when:
    - send_status_messages

@ -0,0 +1,5 @@
{# Renders a user object from `current_user`. NOTE(review): default('null') sits inside double quotes, so a missing firstName/lastName renders as the string "null", not JSON null - confirm this is what the consumer expects. #}
{
"id": "{{ current_user.userId }}",
"firstName": "{{ current_user.firstName | default('null') }}",
"lastName": "{{ current_user.lastName | default('null') }}"
}

@ -0,0 +1,132 @@
---
# Image tags for the monitoring stack. Only the grafana, prometheus and
# alertmanager versions are referenced by prometheus_docker below.
grafana_version: "7.0.5"
prometheus_version: "v2.19.2"
alertmanager_version: "v0.21.0"
blackbox_exporter_version: "v0.17.0"
postgres_exporter_version: "v0.8.0"
# Container ports; the grafana, prometheus and alertmanager ports feed the
# Traefik loadbalancer labels below.
service_port_grafana: 3000
service_port_prometheus: 9090
service_port_alertmanager: 9093
service_port_blackbox_exporter: 9115
service_port_postgres_exporter: 9187
# Container / router / volume identifiers derived from the service name.
prometheus_id: "{{ service_name }}-prometheus"
alertmanager_id: "{{ service_name }}-alertmanager"
grafana_id: "{{ service_name }}-grafana"
# Compose description of the monitoring stack (Prometheus, Alertmanager,
# Grafana). The role hands this structure to the _deploy role as
# `current_docker`. List entries carry an inner pair of double quotes that is
# part of the string value itself.
prometheus_docker:
  # Both networks are declared external, i.e. not created by this stack.
  networks:
    - name: back-tier
      external: true
    - name: front-tier
      external: true
  # One named data volume per service.
  volumes:
    - name: "{{ prometheus_id }}-data"
    - name: "{{ alertmanager_id }}-data"
    - name: "{{ grafana_id }}-data"
  services:
    # --- Prometheus ------------------------------------------------------------
    - name: "{{ prometheus_id }}"
      image_name: "prom/prometheus"
      image_version: "{{ prometheus_version }}"
      labels:
        - '"traefik.enable=true"'
        - '"traefik.http.routers.{{ prometheus_id }}.service={{ prometheus_id }}"'
        - '"traefik.http.routers.{{ prometheus_id }}.rule=Host(`{{ service_name }}-prometheus.{{ domain }}`)"'
        - '"traefik.http.routers.{{ prometheus_id }}.entrypoints=websecure"'
        - '"traefik.http.routers.{{ prometheus_id }}.tls=true"'
        - '"traefik.http.routers.{{ prometheus_id }}.tls.certresolver=letsencrypt"'
        - '"traefik.http.services.{{ prometheus_id }}.loadbalancer.server.port={{ service_port_prometheus }}"'
      command:
        - '"--config.file=/etc/prometheus/prometheus.yml"'
        - '"--storage.tsdb.path=/prometheus"'
        - '"--web.console.libraries=/usr/share/prometheus/console_libraries"'
        - '"--web.console.templates=/usr/share/prometheus/consoles"'
        - '"--web.external-url={{ http_s }}://{{ service_name }}-prometheus.{{ domain }}"'
        - '"--web.enable-lifecycle"'
        - '"--storage.tsdb.retention.time=30w"'
      volumes:
        # Configuration is mounted read-only; data lives in the named volume.
        - '"./config/prometheus/:/etc/prometheus/:ro"'
        - '"{{ prometheus_id }}-data:/prometheus"'
      networks:
        - '"back-tier"'
        - '"front-tier"'
      extra_hosts: "{{ prometheus_extra_hosts | default([]) }}"
    # --- Alertmanager ----------------------------------------------------------
    # No `environment` key: the former LS_JAVA_OPTS entry is a Logstash JVM
    # option and prom/alertmanager is not a JVM service.
    - name: "{{ alertmanager_id }}"
      image_name: "prom/alertmanager"
      image_version: "{{ alertmanager_version }}"
      labels:
        - '"traefik.enable=true"'
        - '"traefik.http.routers.{{ alertmanager_id }}.service={{ alertmanager_id }}"'
        - '"traefik.http.routers.{{ alertmanager_id }}.rule=Host(`{{ service_name }}-alertmanager.{{ domain }}`)"'
        - '"traefik.http.routers.{{ alertmanager_id }}.entrypoints=websecure"'
        - '"traefik.http.routers.{{ alertmanager_id }}.tls=true"'
        - '"traefik.http.routers.{{ alertmanager_id }}.tls.certresolver=letsencrypt"'
        - '"traefik.http.services.{{ alertmanager_id }}.loadbalancer.server.port={{ service_port_alertmanager }}"'
      command:
        - '"--config.file=/etc/alertmanager/config.yml"'
        - '"--storage.path=/alertmanager"'
        - '"--web.external-url={{ http_s }}://{{ service_name }}-alertmanager.{{ domain }}"'
      volumes:
        - '"./config/alertmanager/:/etc/alertmanager/:ro"'
        - '"{{ alertmanager_id }}-data:/alertmanager"'
      networks:
        - '"back-tier"'
        - '"front-tier"'
    # --- Grafana ---------------------------------------------------------------
    - name: "{{ grafana_id }}"
      image_name: "grafana/grafana"
      image_version: "{{ grafana_version }}"
      user: '"472"'
      labels:
        - '"traefik.enable=true"'
        - '"traefik.http.routers.{{ grafana_id }}.service={{ grafana_id }}"'
        - '"traefik.http.routers.{{ grafana_id }}.rule=Host(`{{ service_name }}-grafana.{{ domain }}`)"'
        - '"traefik.http.routers.{{ grafana_id }}.entrypoints=websecure"'
        - '"traefik.http.routers.{{ grafana_id }}.tls=true"'
        - '"traefik.http.routers.{{ grafana_id }}.tls.certresolver=letsencrypt"'
        - '"traefik.http.services.{{ grafana_id }}.loadbalancer.server.port={{ service_port_grafana }}"'
      volumes:
        - '"./config/grafana/provisioning/:/etc/grafana/provisioning/"'
        # Replaces the image's built-in defaults.ini with the deployed one.
        - '"./config/grafana/conf/defaults.ini:/usr/share/grafana/conf/defaults.ini"'
        - '"{{ grafana_id }}-data:/var/lib/grafana"'
      networks:
        - '"back-tier"'
        - '"front-tier"'
      env_file:
        - '"./config/grafana/config.monitoring"'

@ -0,0 +1,184 @@
---
### tags:
### update_config
# Post the rendered "deploy start" template to the Mattermost hook. Runs on the
# control node and only when send_status_messages is truthy.
- name: "Send mattermost message"
  uri:
    url: "{{ mattermost_hook_smardigo }}"
    method: POST
    body: "{{ lookup('template','mattermost-deploy-start.json.j2') }}"
    body_format: json
    headers:
      Content-Type: "application/json"
  delegate_to: 127.0.0.1
  become: false
  when:
    - send_status_messages
# Query the Hetzner Cloud API from the control node for the server list.
- name: Gather current server infos
  delegate_to: 127.0.0.1
  become: false
  hcloud_server_info:
    api_token: "{{ hetzner_authentication_token }}"
  register: hetzner_server_infos

# Keep only the server list of the registered result.
- name: Save current server infos as variable (fact)
  delegate_to: 127.0.0.1
  become: false
  set_fact:
    hetzner_server_infos_json: "{{ hetzner_server_infos.hcloud_server_info }}"

# Set stage_server_ip from the entry whose name equals the inventory hostname.
- name: Read ip for {{ inventory_hostname }}
  delegate_to: 127.0.0.1
  become: false
  with_items: "{{ hetzner_server_infos_json }}"
  when: item.name == inventory_hostname
  set_fact:
    stage_server_ip: "{{ item.ipv4_address }}"
# One DNS record per web UI of the stack, all with the stage server IP as
# record data: <service_name>-prometheus, -grafana and -alertmanager.
- name: "Setup DNS configuration for {{ service_name }} prometheus"
  vars:
    record_name: "{{ service_name }}-prometheus"
    record_data: "{{ stage_server_ip }}"
  include_role:
    name: _digitalocean
    tasks_from: domain

- name: "Setup DNS configuration for {{ service_name }} grafana"
  vars:
    record_name: "{{ service_name }}-grafana"
    record_data: "{{ stage_server_ip }}"
  include_role:
    name: _digitalocean
    tasks_from: domain

- name: "Setup DNS configuration for {{ service_name }} alertmanager"
  vars:
    record_name: "{{ service_name }}-alertmanager"
    record_data: "{{ stage_server_ip }}"
  include_role:
    name: _digitalocean
    tasks_from: domain
# Delegates to the "networks" task file of the _docker role.
- name: "Check docker networks"
  include_role:
    name: _docker
    tasks_from: networks

# Record whether a compose file from an earlier deployment is present.
- name: "Check if {{ service_name }}/docker-compose.yml exists"
  tags:
    - update_config
  stat:
    path: '{{ service_base_path }}/{{ service_name }}/docker-compose.yml'
  register: check_docker_compose_file

# Bring the existing stack down before redeploying; failures are tolerated.
# This task carries no tag, so tag-limited runs do not execute it.
- name: "Stop {{ service_name }}"
  when: check_docker_compose_file.stat.exists
  shell: docker-compose down
  args:
    chdir: '{{ service_base_path }}/{{ service_name }}'
  ignore_errors: true
# Pass the monitoring compose description and target location/ownership to the
# "configs" task file of the _deploy role.
- name: "Deploy service configuration for {{ service_name }}"
  tags:
    - update_config
  vars:
    current_config: "prometheus"
    current_docker: "{{ prometheus_docker }}"
    current_base_path: "{{ service_base_path }}"
    current_destination: "{{ service_name }}"
    current_owner: "{{ docker_owner }}"
    current_group: "{{ docker_group }}"
  include_role:
    name: _deploy
    tasks_from: configs
# Restart the stack: `down` followed by `up -d` in one shell invocation, so a
# run limited to the update_config tag also recreates the containers.
- name: "Start {{ service_name }}"
  tags:
    - update_config
  args:
    chdir: '{{ service_base_path }}/{{ service_name }}'
  shell: |
    docker-compose down
    docker-compose up -d
# Describe the three web UIs (name, URL, version, timestamp) to the
# "caddy_landing_page" task file of the _deploy role.
- name: "Update landing page for {{ service_name }}"
  vars:
    current_services:
      - current_name: "{{ service_prefix }}prometheus"
        current_url: "{{ http_s}}://{{ service_name }}-prometheus.{{ domain }}"
        current_version: "{{ prometheus_version }}"
        current_date: "{{ ansible_date_time.iso8601 }}"
      - current_name: "{{ service_prefix }}grafana"
        current_url: "{{ http_s }}://{{ service_name }}-grafana.{{ domain }}"
        current_version: "{{ grafana_version }}"
        current_date: "{{ ansible_date_time.iso8601 }}"
      - current_name: "{{ service_prefix }}alertmanager"
        current_url: "{{ http_s }}://{{ service_name }}-alertmanager.{{ domain }}"
        current_version: "{{ alertmanager_version }}"
        current_date: "{{ ansible_date_time.iso8601 }}"
  include_role:
    name: _deploy
    tasks_from: caddy_landing_page
# Poll Grafana's admin stats endpoint with the admin credentials until it
# answers 200 (up to 10 retries, 60 s apart). The registered response is
# reused by the user-creation task.
- name: "Wait for {{ http_s }}://{{ service_name }}-grafana.{{ domain }}"
  uri:
    method: GET
    url: "{{ http_s }}://{{ service_name }}-grafana.{{ domain }}/api/admin/stats"
    force_basic_auth: true
    url_username: "{{ grafana_admin_username }}"
    url_password: "{{ grafana_admin_password }}"
    status_code: 200
    return_content: true
  register: grafana_stats
  until: grafana_stats.status == 200
  delay: 60
  retries: 10
# Create the "smardigo" and "guest" Grafana accounts through the admin API.
# Only runs while the stats endpoint reports a single user, i.e. before any
# additional account has been created.
- name: Create grafana users
  uri:
    url: "{{ http_s }}://{{ service_name }}-grafana.{{ domain }}/api/admin/users"
    url_username: "{{ grafana_admin_username }}"
    url_password: "{{ grafana_admin_password }}"
    force_basic_auth: true
    method: POST
    status_code: 200
    body_format: json
    # A mapping is serialised by the uri module itself, so quotes or
    # backslashes in a login/password cannot produce malformed JSON.
    body:
      name: "{{ item.name }}"
      email: "{{ item.email }}"
      login: "{{ item.login }}"
      password: "{{ item.password }}"
    headers:
      Content-Type: application/json
  loop:
    - name: "{{ grafana_user_smardigo_login }}"
      email: "smardigo@arxes-tolina.de"
      login: "{{ grafana_user_smardigo_login }}"
      password: "{{ grafana_user_smardigo_password }}"
    - name: "{{ grafana_user_guest_login }}"
      email: "grafana-guest@arxes-tolina.de"
      login: "{{ grafana_user_guest_login }}"
      password: "{{ grafana_user_guest_password }}"
  # Loop items and the request body contain passwords; keep them out of the
  # task output and logs.
  no_log: true
  when: grafana_stats.json.users == 1
# Post the rendered "deploy end" template to the Mattermost hook. Runs on the
# control node and only when send_status_messages is truthy.
- name: "Send mattermost message"
  uri:
    url: "{{ mattermost_hook_smardigo }}"
    method: POST
    body: "{{ lookup('template','mattermost-deploy-end.json.j2') }}"
    body_format: json
    headers:
      Content-Type: "application/json"
  delegate_to: 127.0.0.1
  become: false
  when:
    - send_status_messages

@ -0,0 +1,16 @@
{# Emits one postgres_exporter service entry per item of `databases`; each item supplies `name`, `uri`, `user` and `pass`. #}
{# NOTE(review): no separator is emitted between consecutive entries - confirm the including context copes with more than one database. #}
{% for database in databases %}
{
name: "{{ service_name }}-postgres-exporter-{{ database.name }}",
image_name: "wrouesnel/postgres_exporter",
image_version: "{{ postgres_exporter_version }}",
environment: [
'DATA_SOURCE_URI: "{{ database.uri }}"',
{# NOTE(review): user/pass are emitted verbatim as whole environment entries, without a key prefix like DATA_SOURCE_URI above; presumably they already hold complete `KEY: "value"` strings - TODO confirm. #}
"{{ database.user }}",
"{{ database.pass }}",
],
networks: [
'"back-tier"',
'"front-tier"',
],
}
{% endfor %}

@ -13,5 +13,7 @@
msg: "The ansible version has to be at least ({{ ansible_version.full }})" msg: "The ansible version has to be at least ({{ ansible_version.full }})"
roles: roles:
- role: docker-registry - role: connect
when: "'docker_registry' in group_names" when: "'connect' in group_names"
- role: prometheus
when: "'prometheus' in group_names"

@ -1,20 +1,27 @@
[hcloud] [connect]
dev-connect-01
dev-connect-02
dev-connect-03
[docker_registry]
dev-docker-registry-01 dev-docker-registry-01
[elastic]
dev-elastic-stack-01 dev-elastic-stack-01
dev-elastic-stack-02 dev-elastic-stack-02
dev-elastic-stack-03 dev-elastic-stack-03
dev-prometheus-01
[docker_registry]
dev-docker-registry-01
[prometheus] [prometheus]
dev-prometheus-01 dev-prometheus-01
[stage_dev:children] [stage_dev:children]
hcloud connect
docker_registry docker_registry
elastic
prometheus prometheus
[all:children] [all:children]
stage_dev stage_dev
[hcloud:children]
stage_dev

@ -0,0 +1,20 @@
# Alertmanager configuration, rendered through Jinja before deployment.
route:
  # Fallback receiver for alerts that match no child route.
  receiver: smardigo
  routes:
    # Alerts labelled for this project and stage.
    - receiver: 'smardigo'
      match:
        project: 'smardigo'
        env: '{{ stage }}'

receivers:
  - name: 'smardigo'
    # The webhook is addressed through Alertmanager's slack_configs receiver.
    slack_configs:
      - api_url: '{{ mattermost_hook_smardigo }}'
        channel: '{{ alertmanager_channel_smardigo }}'
        username: 'prometheus'
        send_resolved: true
        # {{ '{{' }} / {{ '}}' }} make Jinja emit literal braces, so the
        # rendered file contains Alertmanager template calls.
        title: '{{ '{{' }} template "custom_title" . {{ '}}' }}'
        text: '{{ '{{' }} template "custom_slack_message" . {{ '}}' }}'

# File providing the custom_title / custom_slack_message templates.
templates:
  - /etc/alertmanager/templates/notifications.tmpl

@ -0,0 +1,18 @@
{{/* __single_message_title: "<alertname> @ <identifier annotation>" for every firing and every resolved alert in the group. */}}
{{ define "__single_message_title" }}{{ range .Alerts.Firing }}{{ .Labels.alertname }} @ {{ .Annotations.identifier }}{{ end }}{{ range .Alerts.Resolved }}{{ .Labels.alertname }} @ {{ .Annotations.identifier }}{{ end }}{{ end }}
{{/* custom_title: "[STATUS]" (with ":<firing count>" while firing); the single-message title is appended only when the group holds exactly one alert (one firing and none resolved, or the reverse). */}}
{{ define "custom_title" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ if or (and (eq (len .Alerts.Firing) 1) (eq (len .Alerts.Resolved) 0)) (and (eq (len .Alerts.Firing) 0) (eq (len .Alerts.Resolved) 1)) }}{{ template "__single_message_title" . }}{{ end }}{{ end }}
{{/* custom_slack_message: the description annotation for a single-alert group; otherwise bullet lists of summary annotations under "Alerts Firing" / "Alerts Resolved". */}}
{{ define "custom_slack_message" }}
{{ if or (and (eq (len .Alerts.Firing) 1) (eq (len .Alerts.Resolved) 0)) (and (eq (len .Alerts.Firing) 0) (eq (len .Alerts.Resolved) 1)) }}
{{ range .Alerts.Firing }}{{ .Annotations.description }}{{ end }}{{ range .Alerts.Resolved }}{{ .Annotations.description }}{{ end }}
{{ else }}
{{ if gt (len .Alerts.Firing) 0 }}
*Alerts Firing:*
{{ range .Alerts.Firing }}- {{ .Annotations.summary }}
{{ end }}{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
*Alerts Resolved:*
{{ range .Alerts.Resolved }}- {{ .Annotations.summary }}
{{ end }}{{ end }}
{{ end }}
{{ end }}

@ -0,0 +1,12 @@
# Blackbox exporter probe modules.
modules:
  # HTTP GET probe over IPv4 that succeeds only on status 200. Plain HTTP and
  # HTTPS targets are both accepted; certificates are verified.
  http_200:
    prober: http
    timeout: 5s
    http:
      method: GET
      valid_status_codes: [200]
      preferred_ip_protocol: "ip4"
      fail_if_ssl: false
      fail_if_not_ssl: false
      tls_config:
        insecure_skip_verify: false

@ -0,0 +1,770 @@
##################### Grafana Configuration Defaults #####################
#
# Do not modify this file in grafana installs
#
# possible values : production, development
app_mode = production
# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty
instance_name = ${HOSTNAME}
#################################### Paths ###############################
[paths]
# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used)
data = data
# Temporary files in `data` directory older than given duration will be removed
temp_data_lifetime = 24h
# Directory where grafana can store logs
logs = data/log
# Directory where grafana will automatically scan and look for plugins
plugins = data/plugins
# folder that contains provisioning config files that grafana will apply on startup and while running.
provisioning = conf/provisioning
#################################### Server ##############################
[server]
# Protocol (http, https, h2, socket)
protocol = http
# The ip address to bind to, empty will bind to all interfaces
http_addr =
# The http port to use
http_port = 3000
# The public facing domain name used to access grafana from a browser
domain = localhost
# Redirect to correct domain if host header does not match domain
# Prevents DNS rebinding attacks
enforce_domain = false
# The full public facing url
root_url = %(protocol)s://%(domain)s:%(http_port)s/
# Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons.
serve_from_sub_path = false
# Log web requests
router_logging = false
# the path relative to the working path
static_root_path = public
# enable gzip
enable_gzip = false
# https certs & key file
cert_file =
cert_key =
# Unix socket path
socket = /tmp/grafana.sock
#################################### Database ############################
[database]
# You can configure the database connection by specifying type, host, name, user and password
# as separate properties or as one string using the url property.
# Either "mysql", "postgres" or "sqlite3", it's your choice
type = sqlite3
host = 127.0.0.1:3306
name = grafana
user = root
# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;"""
password =
# Use either URL or the previous fields to configure the database
# Example: mysql://user:secret@host:port/database
url =
# Max idle conn setting default is 2
max_idle_conn = 2
# Max conn setting default is 0 (mean not set)
max_open_conn =
# Connection Max Lifetime default is 14400 (means 14400 seconds or 4 hours)
conn_max_lifetime = 14400
# Set to true to log the sql calls and execution times.
log_queries =
# For "postgres", use either "disable", "require" or "verify-full"
# For "mysql", use either "true", "false", or "skip-verify".
ssl_mode = disable
ca_cert_path =
client_key_path =
client_cert_path =
server_cert_name =
# For "sqlite3" only, path relative to data_path setting
path = grafana.db
# For "sqlite3" only. cache mode setting used for connecting to the database
cache_mode = private
#################################### Cache server #############################
[remote_cache]
# Either "redis", "memcached" or "database" default is "database"
type = database
# cache connectionstring options
# database: will use Grafana primary database.
# redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=0,ssl=false`. Only addr is required. ssl may be 'true', 'false', or 'insecure'.
# memcache: 127.0.0.1:11211
connstr =
#################################### Data proxy ###########################
[dataproxy]
# This enables data proxy logging, default is false
logging = false
# How long the data proxy waits before timing out, default is 30 seconds.
# This setting also applies to core backend HTTP data sources where query requests use an HTTP client with timeout set.
timeout = 30
# If enabled and user is not anonymous, data proxy will add X-Grafana-User header with username into the request, default is false.
send_user_header = false
#################################### Analytics ###########################
[analytics]
# Server reporting, sends usage counters to stats.grafana.org every 24 hours.
# No ip addresses are being tracked, only simple counters to track
# running instances, dashboard and error counts. It is very helpful to us.
# Change this option to false to disable reporting.
reporting_enabled = true
# Set to false to disable all checks to https://grafana.com
# for new versions (grafana itself and plugins), check is used
# in some UI views to notify that grafana or plugin update exists
# This option does not cause any auto updates, nor send any information
# only a GET request to https://grafana.com to get latest versions
check_for_updates = true
# Google Analytics universal tracking code, only enabled if you specify an id here
google_analytics_ua_id =
# Google Tag Manager ID, only enabled if you specify an id here
google_tag_manager_id =
#################################### Security ############################
[security]
# disable creation of admin user on first start of grafana
disable_initial_admin_creation = false
# default admin user, created on startup
admin_user = {{ grafana_admin_username }}
# default admin password, can be changed before first start of grafana, or in profile settings
# Wrapped in triple quotes so that a vault value containing # or ; is not cut off as an inline comment.
admin_password = """{{ grafana_admin_password }}"""
# used for signing
# Wrapped in triple quotes for the same reason as admin_password.
secret_key = """{{ grafana_signing_secret }}"""
# disable gravatar profile images
disable_gravatar = false
# data source proxy whitelist (ip_or_domain:port separated by spaces)
data_source_proxy_whitelist =
# disable protection against brute force login attempts
disable_brute_force_login_protection = false
# set to true if you host Grafana behind HTTPS. default is false.
cookie_secure = false
# set cookie SameSite attribute. defaults to `lax`. can be set to "lax", "strict", "none" and "disabled"
cookie_samesite = lax
# set to true if you want to allow browsers to render Grafana in a <frame>, <iframe>, <embed> or <object>. default is false.
allow_embedding = false
# Set to true if you want to enable http strict transport security (HSTS) response header.
# This is only sent when HTTPS is enabled in this configuration.
# HSTS tells browsers that the site should only be accessed using HTTPS.
# The default will change to true in the next minor release, 6.3.
strict_transport_security = false
# Sets how long a browser should cache HSTS. Only applied if strict_transport_security is enabled.
strict_transport_security_max_age_seconds = 86400
# Set to true if to enable HSTS preloading option. Only applied if strict_transport_security is enabled.
strict_transport_security_preload = false
# Set to true if to enable the HSTS includeSubDomains option. Only applied if strict_transport_security is enabled.
strict_transport_security_subdomains = false
# Set to true to enable the X-Content-Type-Options response header.
# The X-Content-Type-Options response HTTP header is a marker used by the server to indicate that the MIME types advertised
# in the Content-Type headers should not be changed and be followed. The default will change to true in the next minor release, 6.3.
x_content_type_options = false
# Set to true to enable the X-XSS-Protection header, which tells browsers to stop pages from loading
# when they detect reflected cross-site scripting (XSS) attacks. The default will change to true in the next minor release, 6.3.
x_xss_protection = false
#################################### Snapshots ###########################
[snapshots]
# snapshot sharing options
external_enabled = true
external_snapshot_url = https://snapshots-origin.raintank.io
external_snapshot_name = Publish to snapshot.raintank.io
# Set to true to enable this Grafana instance act as an external snapshot server and allow unauthenticated requests for
# creating and deleting snapshots.
public_mode = false
# remove expired snapshot
snapshot_remove_expired = true
#################################### Dashboards ##################
[dashboards]
# Number dashboard versions to keep (per dashboard). Default: 20, Minimum: 1
versions_to_keep = 20
# Minimum dashboard refresh interval. When set, this will restrict users to set the refresh interval of a dashboard lower than given interval. Per default this is 5 seconds.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
min_refresh_interval = 5s
# Path to the default home dashboard. If this value is empty, then Grafana uses StaticRootPath + "dashboards/home.json"
default_home_dashboard_path =
#################################### Users ###############################
[users]
# disable user signup / registration
allow_sign_up = false
# Allow non admin users to create organizations
allow_org_create = false
# Set to true to automatically assign new users to the default organization (id 1)
auto_assign_org = true
# Set this value to automatically add new users to the provided organization (if auto_assign_org above is set to true)
auto_assign_org_id = 1
# Default role new users will be automatically assigned (if auto_assign_org above is set to true)
auto_assign_org_role = Viewer
# Require email validation before sign up completes
verify_email_enabled = false
# Background text for the user field on the login page
login_hint = email or username
password_hint = password
# Default UI theme ("dark" or "light")
default_theme = dark
# External user management
external_manage_link_url =
external_manage_link_name =
external_manage_info =
# Viewers can edit/inspect dashboard settings in the browser. But not save the dashboard.
viewers_can_edit = false
# Editors can administrate dashboard, folders and teams they create
editors_can_admin = false
[auth]
# Login cookie name
login_cookie_name = grafana_session
# The lifetime (days) an authenticated user can be inactive before being required to login at next visit. Default is 7 days.
login_maximum_inactive_lifetime_days = 7
# The maximum lifetime (days) an authenticated user can be logged in since login time before being required to login. Default is 30 days.
login_maximum_lifetime_days = 30
# How often should auth tokens be rotated for authenticated users when being active. The default is each 10 minutes.
token_rotation_interval_minutes = 10
# Set to true to disable (hide) the login form, useful if you use OAuth
disable_login_form = false
# Set to true to disable the signout link in the side menu. useful if you use auth.proxy
disable_signout_menu = false
# URL to redirect the user to after sign out
signout_redirect_url =
# Set to true to attempt login with OAuth automatically, skipping the login screen.
# This setting is ignored if multiple OAuth providers are configured.
oauth_auto_login = false
# OAuth state max age cookie duration. Defaults to 60 seconds.
oauth_state_cookie_max_age = 60
# limit of api_key seconds to live before expiration
api_key_max_seconds_to_live = -1
#################################### Anonymous Auth ######################
[auth.anonymous]
# enable anonymous access
enabled = false
# specify organization name that should be used for unauthenticated users
org_name = Main Org.
# specify role for unauthenticated users
org_role = Viewer
#################################### Github Auth #########################
[auth.github]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret = some_secret
scopes = user:email,read:org
auth_url = https://github.com/login/oauth/authorize
token_url = https://github.com/login/oauth/access_token
api_url = https://api.github.com/user
allowed_domains =
team_ids =
allowed_organizations =
#################################### GitLab Auth #########################
[auth.gitlab]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret = some_secret
scopes = api
auth_url = https://gitlab.com/oauth/authorize
token_url = https://gitlab.com/oauth/token
api_url = https://gitlab.com/api/v4
allowed_domains =
allowed_groups =
#################################### Google Auth #########################
[auth.google]
enabled = false
allow_sign_up = true
client_id = some_client_id
client_secret = some_client_secret
scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email
auth_url = https://accounts.google.com/o/oauth2/auth
token_url = https://accounts.google.com/o/oauth2/token
api_url = https://www.googleapis.com/oauth2/v1/userinfo
allowed_domains =
hosted_domain =
#################################### Grafana.com Auth ####################
# legacy key names (so they work in env variables)
[auth.grafananet]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret = some_secret
scopes = user:email
allowed_organizations =
[auth.grafana_com]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret = some_secret
scopes = user:email
allowed_organizations =
#################################### Azure AD OAuth #######################
[auth.azuread]
name = Azure AD
enabled = false
allow_sign_up = true
client_id = some_client_id
client_secret = some_client_secret
scopes = openid email profile
auth_url = https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/authorize
token_url = https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/token
allowed_domains =
allowed_groups =
#################################### Okta OAuth #######################
[auth.okta]
name = Okta
enabled = false
allow_sign_up = true
client_id = some_id
client_secret = some_secret
scopes = openid profile email groups
auth_url = https://<tenant-id>.okta.com/oauth2/v1/authorize
token_url = https://<tenant-id>.okta.com/oauth2/v1/token
api_url = https://<tenant-id>.okta.com/oauth2/v1/userinfo
allowed_domains =
allowed_groups =
role_attribute_path =
#################################### Generic OAuth #######################
[auth.generic_oauth]
name = OAuth
enabled = false
allow_sign_up = true
client_id = some_id
client_secret = some_secret
scopes = user:email
email_attribute_name = email:primary
email_attribute_path =
role_attribute_path =
auth_url =
token_url =
api_url =
allowed_domains =
team_ids =
allowed_organizations =
tls_skip_verify_insecure = false
tls_client_cert =
tls_client_key =
tls_client_ca =
#################################### Basic Auth ##########################
[auth.basic]
enabled = true
#################################### Auth Proxy ##########################
[auth.proxy]
enabled = false
header_name = X-WEBAUTH-USER
header_property = username
auto_sign_up = true
# Deprecated, use sync_ttl instead
ldap_sync_ttl = 60
sync_ttl = 60
whitelist =
headers =
enable_login_token = false
#################################### Auth LDAP ###########################
[auth.ldap]
enabled = false
config_file = /etc/grafana/ldap.toml
allow_sign_up = true
# LDAP background sync (Enterprise only)
# At 1 am every day
sync_cron = "0 0 1 * * *"
active_sync_enabled = true
#################################### SMTP / Emailing #####################
# Outgoing mail settings. No user/password is configured, so the relay is
# contacted without SMTP authentication.
[smtp]
enabled = true
# NOTE(review): host and from_address are hard-coded to specific hosts instead
# of being templated like the admin credentials in [security]; confirm they are
# valid for every stage this template is deployed to.
host = smtp.tolina.local:25
user =
# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;"""
password =
cert_file =
key_file =
# NOTE(review): presumably turns off verification of the SMTP server
# certificate; confirm this is intended.
skip_verify = true
from_address = admin@grafana-qa.smardigo.digital
from_name = smardigo
ehlo_identity =
[emails]
welcome_email_on_sign_up = false
templates_pattern = emails/*.html
#################################### Logging ##########################
[log]
# Either "console", "file", "syslog". Default is console and file
# Use space to separate multiple modes, e.g. "console file"
mode = console file
# Either "debug", "info", "warn", "error", "critical", default is "info"
level = info
# optional settings to set different levels for specific loggers. Ex filters = sqlstore:debug
filters =
# For "console" mode only
[log.console]
level =
# log line format, valid options are text, console and json
format = console
# For "file" mode only
[log.file]
level =
# log line format, valid options are text, console and json
format = text
# This enables automated log rotation (master switch for the following options), default is true
log_rotate = true
# Max line number of single file, default is 1000000
max_lines = 1000000
# Max size shift of single file, default is 28 means 1 << 28, 256MB
max_size_shift = 28
# Segment log daily, default is true
daily_rotate = true
# Expired days of log file(delete after max days), default is 7
max_days = 7
[log.syslog]
level =
# log line format, valid options are text, console and json
format = text
# Syslog network type and address. This can be udp, tcp, or unix. If left blank, the default unix endpoints will be used.
network =
address =
# Syslog facility. user, daemon and local0 through local7 are valid.
facility =
# Syslog tag. By default, the process' argv[0] is used.
tag =
#################################### Usage Quotas ########################
[quota]
enabled = false
#### set quotas to -1 to make unlimited. ####
# limit number of users per Org.
org_user = 10
# limit number of dashboards per Org.
org_dashboard = 100
# limit number of data_sources per Org.
org_data_source = 10
# limit number of api_keys per Org.
org_api_key = 10
# limit number of orgs a user can create.
user_org = 10
# Global limit of users.
global_user = -1
# global limit of orgs.
global_org = -1
# global limit of dashboards
global_dashboard = -1
# global limit of api_keys
global_api_key = -1
# global limit on number of logged in users.
global_session = -1
#################################### Alerting ############################
[alerting]
# Set to false to disable the alerting engine & UI features
enabled = true
# Makes it possible to turn off alert rule execution but alerting UI is visible
execute_alerts = true
# Default setting for new alert rules. Defaults to categorize error and timeouts as alerting. (alerting, keep_state)
error_or_timeout = alerting
# Default setting for how Grafana handles nodata or null values in alerting. (alerting, no_data, keep_state, ok)
nodata_or_nullvalues = no_data
# Alert notifications can include images, but rendering many images at the same time can overload the server
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
concurrent_render_limit = 5
# Default setting for alert calculation timeout. Default value is 30
evaluation_timeout_seconds = 30
# Default setting for alert notification timeout. Default value is 30
notification_timeout_seconds = 30
# Default setting for max attempts to sending alert notifications. Default value is 3
max_attempts = 3
# Makes it possible to enforce a minimal interval between evaluations, to reduce load on the backend
min_interval_seconds = 1
#################################### Explore #############################
[explore]
# Enable the Explore section
enabled = true
#################################### Internal Grafana Metrics ############
# Metrics available at HTTP API Url /metrics
[metrics]
enabled = true
interval_seconds = 10
# Disable total stats (stat_totals_*) metrics to be generated
disable_total_stats = false
#If both are set, basic auth will be required for the metrics endpoint.
basic_auth_username =
basic_auth_password =
# Send internal Grafana metrics to graphite
[metrics.graphite]
# Enable by setting the address setting (ex localhost:2003)
address =
prefix = prod.grafana.%(instance_name)s.
#################################### Grafana.com integration ##########################
[grafana_net]
url = https://grafana.com
[grafana_com]
url = https://grafana.com
#################################### Distributed tracing ############
[tracing.jaeger]
# jaeger destination (ex localhost:6831)
address =
# tag that will always be included in when creating new spans. ex (tag1:value1,tag2:value2)
always_included_tag =
# Type specifies the type of the sampler: const, probabilistic, rateLimiting, or remote
sampler_type = const
# jaeger samplerconfig param
# for "const" sampler, 0 or 1 for always false/true respectively
# for "probabilistic" sampler, a probability between 0 and 1
# for "rateLimiting" sampler, the number of spans per second
# for "remote" sampler, param is the same as for "probabilistic"
# and indicates the initial sampling rate before the actual one
# is received from the mothership
sampler_param = 1
# Whether or not to use Zipkin span propagation (x-b3- HTTP headers).
zipkin_propagation = false
# Setting this to true disables shared RPC spans.
# Not disabling is the most common setting when using Zipkin elsewhere in your infrastructure.
disable_shared_zipkin_spans = false
#################################### External Image Storage ##############
[external_image_storage]
# Used for uploading images to public servers so they can be included in slack/email messages.
# You can choose between (s3, webdav, gcs, azure_blob, local)
provider =
[external_image_storage.s3]
endpoint =
path_style_access =
bucket_url =
bucket =
region =
path =
access_key =
secret_key =
[external_image_storage.webdav]
url =
username =
password =
public_url =
[external_image_storage.gcs]
key_file =
bucket =
path =
[external_image_storage.azure_blob]
account_name =
account_key =
container_name =
[external_image_storage.local]
# does not require any configuration
[rendering]
# Options to configure a remote HTTP image rendering service, e.g. using https://github.com/grafana/grafana-image-renderer.
# URL to a remote HTTP image renderer service, e.g. http://localhost:8081/render, will enable Grafana to render panels and dashboards to PNG-images using HTTP requests to an external service.
server_url =
# If the remote HTTP image renderer service runs on a different server than the Grafana server you may have to configure this to a URL where Grafana is reachable, e.g. http://grafana.domain/.
callback_url =
# Concurrent render request limit affects when the /render HTTP endpoint is used. Rendering many images at the same time can overload the server,
# which this setting can help protect against by only allowing a certain amount of concurrent requests.
concurrent_render_request_limit = 30
[panels]
# kept here to support old env variables; can be removed after a few months
enable_alpha = false
disable_sanitize_html = false
[plugins]
enable_alpha = false
app_tls_skip_verify_insecure = false
# Enter a comma-separated list of plugin identifiers to identify plugins that are allowed to be loaded even if they lack a valid signature.
allow_loading_unsigned_plugins =
#################################### Grafana Image Renderer Plugin ##########################
[plugin.grafana-image-renderer]
# Instruct headless browser instance to use a default timezone when not provided by Grafana, e.g. when rendering panel image of alert.
# See ICUs metaZones.txt (https://cs.chromium.org/chromium/src/third_party/icu/source/data/misc/metaZones.txt) for a list of supported
# timezone IDs. Fallbacks to TZ environment variable if not set.
rendering_timezone =
# Instruct headless browser instance to use a default language when not provided by Grafana, e.g. when rendering panel image of alert.
# Please refer to the HTTP header Accept-Language to understand how to format this value, e.g. 'fr-CH, fr;q=0.9, en;q=0.8, de;q=0.7, *;q=0.5'.
rendering_language =
# Instruct headless browser instance to use a default device scale factor when not provided by Grafana, e.g. when rendering panel image of alert.
# Default is 1. Using a higher value will produce more detailed images (higher DPI), but will require more disk space to store an image.
rendering_viewport_device_scale_factor =
# Instruct headless browser instance whether to ignore HTTPS errors during navigation. Per default HTTPS errors are not ignored. Due to
# the security risk it's not recommended to ignore HTTPS errors.
rendering_ignore_https_errors =
# Instruct headless browser instance whether to capture and log verbose information when rendering an image. Default is false and will
# only capture and log error messages. When enabled, debug messages are captured and logged as well.
# For the verbose information to be included in the Grafana server log you have to adjust the rendering log level to debug, configure
# [log].filters = rendering:debug.
rendering_verbose_logging =
# Instruct headless browser instance whether to output its debug and error messages into running process of remote rendering service.
# Default is false. This can be useful to enable (true) when troubleshooting.
rendering_dumpio =
# Additional arguments to pass to the headless browser instance. Default is --no-sandbox. The list of Chromium flags can be found
# here (https://peter.sh/experiments/chromium-command-line-switches/). Multiple arguments is separated with comma-character.
rendering_args =
# You can configure the plugin to use a different browser binary instead of the pre-packaged version of Chromium.
# Please note that this is not recommended, since you may encounter problems if the installed version of Chrome/Chromium is not
# compatible with the plugin.
rendering_chrome_bin =
# Instruct how headless browser instances are created. Default is 'default' and will create a new browser instance on each request.
# Mode 'clustered' will make sure that only a maximum of browsers/incognito pages can execute concurrently.
# Mode 'reusable' will have one browser instance and will create a new incognito page on each request.
rendering_mode =
# When rendering_mode = clustered you can instruct how many browsers or incognito pages can execute concurrently. Default is 'browser'
# and will cluster using browser instances.
# Mode 'context' will cluster using incognito pages.
rendering_clustering_mode =
# When rendering_mode = clustered you can define the maximum number of browser instances/incognito pages that can execute concurrently.
rendering_clustering_max_concurrency =
# Limit the maximum viewport width, height and device scale factor that can be requested.
rendering_viewport_max_width =
rendering_viewport_max_height =
rendering_viewport_max_device_scale_factor =
# Change the listening host and port of the gRPC server. Default host is 127.0.0.1 and default port is 0 and will automatically assign
# a port not in use.
grpc_host =
grpc_port =
[enterprise]
license_path =
[feature_toggles]
# enable features, separated by spaces
enable =

@ -0,0 +1,2 @@
# Environment settings for Grafana, rendered from a template.
# The admin password is taken from the grafana_admin_password variable.
GF_SECURITY_ADMIN_PASSWORD={{ grafana_admin_password }}
# Keep user self-registration disabled.
GF_USERS_ALLOW_SIGN_UP=false

@ -0,0 +1,11 @@
# Grafana dashboard provisioning: one file-based provider named 'Prometheus'.
apiVersion: 1
providers:
- name: 'Prometheus'
orgId: 1
# Empty folder name: no dashboard folder is named for this provider.
folder: ''
type: file
disableDeletion: false
editable: true
options:
# Directory from which the dashboard files are read.
path: /etc/grafana/provisioning/dashboards

@ -0,0 +1,50 @@
# Grafana datasource provisioning (config file version 1).
apiVersion: 1

# Datasources that should be deleted from the database.
deleteDatasources:
  - name: Prometheus
    orgId: 1

# Datasources to insert/update, depending on what is already in the database.
datasources:
  - name: Prometheus
    type: prometheus
    # Access mode: "proxy" (alternative: "direct").
    access: proxy
    orgId: 1
    # service_name is substituted at deploy time; quoted so the rendered
    # value is always read as a string.
    url: "http://{{ service_name }}-prometheus:9090"
    # NOTE(review): admin / foobar look like sample credentials. Confirm
    # whether the Prometheus endpoint really requires basic auth and, if it
    # does, take the values from the vault instead of hard-coding them.
    basicAuth: true
    basicAuthUser: admin
    basicAuthPassword: foobar
    # Marks this as the default datasource (max one per org).
    isDefault: true
    # Client-certificate auth is off, so no TLS material is provisioned.
    jsonData:
      tlsAuth: false
      tlsAuthWithCACert: false
    version: 1
    # Allow users to edit the datasource from the UI.
    editable: true

@ -0,0 +1,252 @@
groups:
- name: tolina
rules:
- alert: too_many_connections
expr: pg_stat_database_numbackends > 20
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.datname {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.datname {{ '}}' }}> has too many open db connections"
description: "Too many connections for more than 2 minutes."
- alert: service_down
expr: up == 0
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> down"
description: "Down for more than 2 minutes."
# Fires when 100 minus the averaged idle-CPU rate (in percent) stays above 90
# for 2 minutes. The average is grouped by instance, env and job.
- alert: high_load
expr: 100 - (avg by (instance, env, job) (irate(node_cpu_seconds_total{env="{{ stage }}", mode="idle"}[30s])) * 100) > 90
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> under high load"
description: "High load for more than 2 minutes."
- alert: apt_upgrades_pending
expr: apt_upgrades_pending > 0
for: 1m
labels:
severity: warning
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> has pending updates."
description: "Pending updates [{{ '{{' }} $labels.origin {{ '}}' }}] / [{{ '{{' }} $labels.arch {{ '}}' }}]."
- alert: reboot_required
expr: node_reboot_required == 1
for: 1m
labels:
severity: warning
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs reboot."
description: "Need reboot!"
- alert: veeam_backup_failed
expr: veeam_backup_failed == 1
for: 1m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
description: "Backup failed."
- alert: probe_ssl_certificates
expr: probe_ssl_earliest_cert_expiry{job=~"http_check_200_secure|http_check_403_secure"} - time() < 86400 * 30
for: 2m
labels:
severity: warning
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate until next 30 days."
description: "needs a new certificate until next 30 days."
- alert: probe_ssl_certificates_50
expr: probe_ssl_earliest_cert_expiry{job="http_check_200_secure", instance="https://mail.sparkassenfinanzportal.de"} - time() < 86400 * 50
for: 2m
labels:
severity: warning
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> needs a new certificate until next 50 days."
description: "needs a new certificate until next 50 days."
- alert: disk_space
expr: predict_linear(node_filesystem_avail_bytes{device!="/dev/loop1",device!="veeamagent",device!="/dev/veeamimage1",env="{{ stage }}",fstype!="cifs",mountpoint=~"/|/rootfs"}[1d], 24 * 3600 * 14) < 1024 * 1024 * 1024 * 5
for: 2m
labels:
severity: warning
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> disk space falls below 5GB in the next 14 days."
description: "disk space falls below 5GB in the next 14 days."
- alert: software_raid_disks_active
expr: node_md_disks_active != 2
for: 2m
labels:
severity: warning
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> software-raid disks broken."
description: "software-raid disks broken."
- alert: software_raid_active
expr: node_md_is_active != 1
for: 2m
labels:
severity: warning
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> raid inactive."
description: "software-raid inactive."
- alert: restic_backup_failed
expr: (restic_cmd_return_code_mount + restic_cmd_return_code_backup + restic_cmd_return_code_forget + restic_cmd_return_code_prune + restic_cmd_return_code_check + restic_cmd_return_code_stats + restic_cmd_return_code_umount + restic_backup_failed) > 0
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> backup failed."
description: "backup failed."
- alert: megaraid_smart_errors
expr: megaraid_smart_errors > 1
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> disk(s) reporting smart errors."
description: "megaraid smart errors."
- alert: megaraid_status_failed
expr: megaraid_status_failed > 1
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid status failed."
description: "megaraid status errors."
- alert: megaraid_other_error_count
expr: megaraid_other_error_count > 1
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting raid controller errors."
description: "megaraid other error count."
- alert: megaraid_exit_status
expr: megaraid_exit_status > 1
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> megaraid exporter status failed."
description: "megaraid exit status."
- alert: adaptec_controller_defunct_disk_drives
expr: adaptec_controller_defunct_disk_drives > 1
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting disk failure."
description: "adaptec controller defunct disk drives."
- alert: adaptec_physical_devices_smart_failed
expr: adaptec_physical_devices_smart_failed > 1
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart errors."
description: "adaptec physical devices smart failed."
- alert: adaptec_physical_devices_smart_warnings
expr: adaptec_physical_devices_smart_warnings > 1
for: 2m
labels:
severity: warning
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> physical devices reporting smart warnings."
description: "adaptec physical devices smart warnings."
- alert: adaptec_controller_logical_failed
expr: adaptec_controller_logical_failed > 1
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical failure."
description: "adaptec controller logical failed."
- alert: adaptec_controller_status_failed
expr: adaptec_controller_status_failed > 1
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller status failure."
description: "adaptec controller status failed."
- alert: adaptec_controller_temperature_status_failed
expr: adaptec_controller_temperature_status_failed > 1
for: 2m
labels:
severity: warning
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller temperatur status failure."
description: "adaptec controller temperature status failed"
- alert: adaptec_logical_degraded
expr: adaptec_logical_degraded > 1
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting controller logical degraded."
description: "adaptec logical degraded."
# Fires when the last restic backup is older than 180 minutes (3 hours).
# Assumes restic_backup_timestamp is a Unix timestamp in seconds, matching
# time() - TODO confirm against the exporter.
- alert: backup_execution
expr: (time() - restic_backup_timestamp) / 60 > 180
for: 2m
labels:
severity: critical
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting latest backup execution is older than 3 hours."
description: "latest backup is older than 3 hours."
# Fires when the summed restic command durations, divided by 60, exceed 60.
# Presumably the durations are reported in seconds, making the threshold
# 60 minutes - confirm the exporter's unit.
- alert: backup_duration
expr: (restic_cmd_duration_backup + restic_cmd_duration_check + restic_cmd_duration_forget + restic_cmd_duration_mount + restic_cmd_duration_prune + restic_cmd_duration_stats + restic_cmd_duration_umount) / 60 > 60
for: 2m
labels:
severity: warning
annotations:
identifier: '{{ '{{' }} $labels.instance {{ '}}' }}'
summary: "Instance <{{ '{{' }} $labels.instance {{ '}}' }}> reporting backup duration took more than 60 minutes."
description: "backup duration took more than 60 minutes."

@ -0,0 +1,8 @@
- targets:
{% for target in blackbox_exporter_targets_connect | default([]) %}
- {{ target }}
{% endfor %}
labels:
job: 'blackbox-exporter'
project: 'connect'
env: '{{ stage }}'

@ -0,0 +1,8 @@
- targets:
{% for target in blackbox_exporter_targets | default([]) %}
- {{ target }}
{% endfor %}
labels:
job: 'blackbox-exporter'
project: 'smardigo'
env: '{{ stage }}'

@ -0,0 +1,92 @@
# my global config
global:
scrape_interval: 15s # By default, scrape targets every 15 seconds.
evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
# scrape_timeout is set to the global default (10s).
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
monitor: '{{ stage_server_name }}'
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
rule_files:
- 'alert.rules'
# alert
alerting:
alertmanagers:
- scheme: http
static_configs:
- targets:
- "{{ service_name }}-alertmanager:9093"
# Scrape configurations. The first job scrapes Prometheus itself;
# application and server jobs follow below.
scrape_configs:
##############################################
### Applications ####
##############################################
- job_name: 'prometheus'
static_configs:
- targets: [
'{{ service_name }}-prometheus:9090'
]
labels:
env: {{ stage }}
project: smardigo
relabel_configs:
- source_labels: [__address__]
target_label: instance
replacement: '{{ service_prefix }}prometheus.{{ domain }}'
##############################################
# Scrape job for the connect application: combines explicitly configured
# targets with one target per host of the 'connect' inventory group.
- job_name: 'connect'
scheme: {{ http_s }}
metrics_path: '/management/prometheus'
static_configs:
# NOTE(review): entries from prometheus_targets_connect are rendered without
# quotes or a trailing comma, unlike the generated host entries below.
# Unless each entry already carries its own quoting and comma, more than one
# entry (or one entry followed by a host entry) presumably renders an invalid
# flow sequence - verify against the variable's definition.
- targets: [
{% for target in prometheus_targets_connect | default([]) %}
{{ target }}
{% endfor %}
{% for host in groups['connect'] | default([]) %}
'{{ host }}.{{ domain }}:{{ monitor_port_service }}',
{% endfor %}
]
labels:
env: {{ stage }}
project: smardigo
application: connect
relabel_configs:
# Strip the port from the scrape address and use the rest as the instance label.
- source_labels: [__address__]
regex: (.*):.*
target_label: instance
replacement: $1
##############################################
### Servers ####
##############################################
- job_name: 'node-exporter'
scheme: {{ http_s }}
metrics_path: '/metrics'
static_configs:
- targets: [
{% for host in groups['stage_dev'] | default([]) %}
'{{ host }}.{{ domain }}:{{ monitor_port_system }}',
{% endfor %}
]
labels:
env: {{ stage }}
project: smardigo
relabel_configs:
- source_labels: [job]
target_label: job
replacement: 'node-exporter'
- source_labels: [__address__]
regex: (.*):.*
target_label: instance
replacement: $1

@ -17,7 +17,7 @@
[http.routers.metrics] [http.routers.metrics]
rule = "Host(`{{ stage_server_name }}.{{ domain }}`) && Path(`/metrics`)" rule = "Host(`{{ stage_server_name }}.{{ domain }}`) && Path(`/metrics`)"
entrypoints = ["admin-service"] entrypoints = ["admin-service"]
middlewares = ["traefik-auth"] # middlewares = ["traefik-auth"]
service = "prometheus@internal" service = "prometheus@internal"
[http.routers.metrics.tls] [http.routers.metrics.tls]
certResolver = "letsencrypt" certResolver = "letsencrypt"

Loading…
Cancel
Save