diff --git a/group_vars/all/plain.yml b/group_vars/all/plain.yml
index 1750a2c..e03fbf9 100644
--- a/group_vars/all/plain.yml
+++ b/group_vars/all/plain.yml
@@ -1,4 +1,6 @@
 ---
+# NOTE(review): defined in group_vars/all, so it applies to every host - confirm this override is meant to be global.
+ansible_ssh_host: "{{ stage_server_domain }}"
 debug: false
 
 ssh_macs:
@@ -230,5 +232,3 @@ prometheus_alert_pg_replication_lag: 120
 upstream_dns_servers:
   - 185.12.64.1
   - 185.12.64.2
-
-
diff --git a/restart_node_exp.yaml b/restart_node_exp.yaml
new file mode 100644
index 0000000..8bc8f31
--- /dev/null
+++ b/restart_node_exp.yaml
@@ -0,0 +1,15 @@
+---
+# Ad-hoc playbook: restarts the prometheus-node-exporter unit on all hosts.
+- hosts: all
+  gather_facts: false
+  vars:
+    ansible_ssh_host: "{{ stage_server_domain }}"
+  # NOTE(review): personal account is hard-coded here - consider passing it with -u instead.
+  remote_user: friedrich.goerz
+  strategy: free
+  tasks:
+    - name: restart node-exporter
+      become: true
+      ansible.builtin.systemd:
+        name: prometheus-node-exporter
+        state: restarted
diff --git a/roles/common/handlers/main.yml b/roles/common/handlers/main.yml
index 0ba3dff..0faf08a 100644
--- a/roles/common/handlers/main.yml
+++ b/roles/common/handlers/main.yml
@@ -5,8 +5,12 @@
     name=ntpd
     state=restarted
 
-
 - name: restart ssh
   service:
     name=sshd
     state=restarted
+
+# Notified by the GRUB_DEFAULT task in tasks/DEV_701_kernel_fix.yaml.
+- name: "Regenerate grub config"
+  become: true
+  command: "/usr/sbin/update-grub"
diff --git a/roles/common/tasks/DEV_701_kernel_fix.yaml b/roles/common/tasks/DEV_701_kernel_fix.yaml
new file mode 100644
index 0000000..ee65811
--- /dev/null
+++ b/roles/common/tasks/DEV_701_kernel_fix.yaml
@@ -0,0 +1,22 @@
+---
+# DEV-701: install the 5.4.0-131 kernel packages and select that kernel as the GRUB default entry.
+- name: "Ensure needed kernel packages for fix"
+  become: true
+  apt:
+    pkg:
+      - linux-image-5.4.0-131-generic
+      - linux-modules-5.4.0-131-generic
+  tags:
+    - kernel-fix
+
+# The handler is only notified when the GRUB_DEFAULT line actually changes.
+- name: "Ensure kernel version in GRUB"
+  become: true
+  lineinfile:
+    state: present
+    regexp: "^GRUB_DEFAULT="
+    line: 'GRUB_DEFAULT="Advanced options for Ubuntu>Ubuntu, with Linux 5.4.0-131-generic"'
+    path: /etc/default/grub
+  notify: "Regenerate grub config"
+  tags:
+    - kernel-fix
diff --git a/roles/common/tasks/main.yml b/roles/common/tasks/main.yml
index b287053..1e5b1d1 100644
--- a/roles/common/tasks/main.yml
+++ b/roles/common/tasks/main.yml
@@ -280,3 +280,9 @@
     - ssh_hardening
   tags:
     - ssh_hardening
+
+# DEV-701
+- name: "Fixing flapping node-exporter due to hassle with new kernel(s)"
+  include_tasks: DEV_701_kernel_fix.yaml
+  tags:
+    - kernel-fix
diff --git a/roles/node_exporter/files/default_config b/roles/node_exporter/files/default_config
new file mode 100644
index 0000000..99456ce
--- /dev/null
+++ b/roles/node_exporter/files/default_config
@@ -0,0 +1,128 @@
+# Set the command-line arguments to pass to the server.
+# Due to shell escaping, to pass backslashes for regexes, you need to double
+# them (\\d for \d). If running under systemd, you need to double them again
+# (\\\\d to mean \d), and escape newlines too.
+ARGS="--web.listen-address='127.0.0.1:9082'"
+
+# Prometheus-node-exporter supports the following options:
+#
+# --collector.diskstats.ignored-devices="^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$"
+# Regexp of devices to ignore for diskstats.
+# --collector.filesystem.ignored-mount-points="^/(dev|proc|run|sys|mnt|media|var/lib/docker)($|/)"
+# Regexp of mount points to ignore for filesystem
+# collector.
+# --collector.filesystem.ignored-fs-types="^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$"
+# Regexp of filesystem types to ignore for
+# filesystem collector.
+# --collector.netdev.ignored-devices="^lo$"
+# Regexp of net devices to ignore for netdev
+# collector.
+# --collector.netstat.fields="^(.*_(InErrors|InErrs)|Ip_Forwarding|Ip(6|Ext)_(InOctets|OutOctets)|Icmp6?_(InMsgs|OutMsgs)|TcpExt_(Listen.*|Syncookies.*)|Tcp_(ActiveOpens|PassiveOpens|RetransSegs|CurrEstab)|Udp6?_(InDatagrams|OutDatagrams|NoPorts))$"
+# Regexp of fields to return for netstat
+# collector.
+# --collector.ntp.server="127.0.0.1"
+# NTP server to use for ntp collector
+# --collector.ntp.protocol-version=4
+# NTP protocol version
+# --collector.ntp.server-is-local
+# Certify that collector.ntp.server address is the
+# same local host as this collector.
+# --collector.ntp.ip-ttl=1 IP TTL to use while sending NTP query
+# --collector.ntp.max-distance=3.46608s
+# Max accumulated distance to the root
+# --collector.ntp.local-offset-tolerance=1ms
+# Offset between local clock and local ntpd time
+# to tolerate
+# --path.procfs="/proc" procfs mountpoint.
+# --path.sysfs="/sys" sysfs mountpoint.
+# --collector.qdisc.fixtures=""
+# test fixtures to use for qdisc collector
+# end-to-end testing
+# --collector.runit.servicedir="/etc/service"
+# Path to runit service directory.
+# --collector.supervisord.url="http://localhost:9001/RPC2"
+# XML RPC endpoint.
+# --collector.systemd.unit-whitelist=".+"
+# Regexp of systemd units to whitelist. Units must
+# both match whitelist and not match blacklist to
+# be included.
+# --collector.systemd.unit-blacklist=".+(\\.device|\\.scope|\\.slice|\\.target)"
+# Regexp of systemd units to blacklist. Units must
+# both match whitelist and not match blacklist to
+# be included.
+# --collector.systemd.private
+# Establish a private, direct connection to
+# systemd without dbus.
+# --collector.textfile.directory="/var/lib/prometheus/node-exporter"
+# Directory to read text files with metrics from.
+# --collector.vmstat.fields="^(oom_kill|pgpg|pswp|pg.*fault).*"
+# Regexp of fields to return for vmstat collector.
+# --collector.wifi.fixtures=""
+# test fixtures to use for wifi collector metrics
+# --collector.arp Enable the arp collector (default: enabled).
+# --collector.bcache Enable the bcache collector (default: enabled).
+# --collector.bonding Enable the bonding collector (default: enabled).
+# --collector.buddyinfo Enable the buddyinfo collector (default:
+# disabled).
+# --collector.conntrack Enable the conntrack collector (default:
+# enabled).
+# --collector.cpu Enable the cpu collector (default: enabled).
+# --collector.diskstats Enable the diskstats collector (default:
+# enabled).
+# --collector.drbd Enable the drbd collector (default: disabled).
+# --collector.edac Enable the edac collector (default: enabled).
+# --collector.entropy Enable the entropy collector (default: enabled).
+# --collector.filefd Enable the filefd collector (default: enabled).
+# --collector.filesystem Enable the filesystem collector (default:
+# enabled).
+# --collector.hwmon Enable the hwmon collector (default: enabled).
+# --collector.infiniband Enable the infiniband collector (default:
+# enabled).
+# --collector.interrupts Enable the interrupts collector (default:
+# disabled).
+# --collector.ipvs Enable the ipvs collector (default: enabled).
+# --collector.ksmd Enable the ksmd collector (default: disabled).
+# --collector.loadavg Enable the loadavg collector (default: enabled).
+# --collector.logind Enable the logind collector (default: disabled).
+# --collector.mdadm Enable the mdadm collector (default: enabled).
+# --collector.meminfo Enable the meminfo collector (default: enabled).
+# --collector.meminfo_numa Enable the meminfo_numa collector (default:
+# disabled).
+# --collector.mountstats Enable the mountstats collector (default:
+# disabled).
+# --collector.netdev Enable the netdev collector (default: enabled).
+# --collector.netstat Enable the netstat collector (default: enabled).
+# --collector.nfs Enable the nfs collector (default: enabled).
+# --collector.nfsd Enable the nfsd collector (default: enabled).
+# --collector.ntp Enable the ntp collector (default: disabled).
+# --collector.qdisc Enable the qdisc collector (default: disabled).
+# --collector.runit Enable the runit collector (default: disabled).
+# --collector.sockstat Enable the sockstat collector (default:
+# enabled).
+# --collector.stat Enable the stat collector (default: enabled).
+# --collector.supervisord Enable the supervisord collector (default:
+# disabled).
+# --collector.systemd Enable the systemd collector (default: enabled).
+# --collector.tcpstat Enable the tcpstat collector (default:
+# disabled).
+# --collector.textfile Enable the textfile collector (default:
+# enabled).
+# --collector.time Enable the time collector (default: enabled).
+# --collector.uname Enable the uname collector (default: enabled).
+# --collector.vmstat Enable the vmstat collector (default: enabled).
+# --collector.wifi Enable the wifi collector (default: enabled).
+# --collector.xfs Enable the xfs collector (default: enabled).
+# --collector.zfs Enable the zfs collector (default: enabled).
+# --collector.timex Enable the timex collector (default: enabled).
+# --web.listen-address=":9100"
+# Address on which to expose metrics and web
+# interface.
+# --web.telemetry-path="/metrics"
+# Path under which to expose metrics.
+# --log.level="info" Only log messages with the given severity or
+# above. Valid levels: [debug, info, warn, error,
+# fatal]
+# --log.format="logger:stderr"
+# Set the log target and format. Example:
+# "logger:syslog?appname=bob&local=7" or
+# "logger:stdout?json=true"
diff --git a/roles/node_exporter/tasks/main.yml b/roles/node_exporter/tasks/main.yml
index 58c7871..f4e70fa 100644
--- a/roles/node_exporter/tasks/main.yml
+++ b/roles/node_exporter/tasks/main.yml
@@ -8,6 +8,17 @@
       - prometheus-node-exporter
       - prometheus-node-exporter-collectors
 
+# Seed the defaults file only when it is missing: with force set to false an
+# existing file's content is left untouched, so no separate stat check is needed.
+- name: "providing default config"
+  copy:
+    src: default_config
+    dest: /etc/default/prometheus-node-exporter
+    owner: root
+    group: root
+    mode: '0644'
+    force: false
+
 - name: "Setup prometheus-node-exporter interface bind"
   lineinfile:
     path: /etc/default/prometheus-node-exporter