From 5712a339ce7ed25410b45d7e439394f6e80945b9 Mon Sep 17 00:00:00 2001 From: Nemo Date: Thu, 28 Dec 2017 17:59:54 +0530 Subject: [PATCH 1/7] Initial work on monitoring - Using https://github.com/danguita/prometheus-monitoring-stack/blob/master/docker-compose.yml as base --- main.tf | 6 ++ monitoring/config/alert.rules | 19 +++++ monitoring/config/alertmanager.template.yml | 10 +++ monitoring/config/prometheus.yml | 17 +++++ monitoring/data.tf | 16 ++++ monitoring/images.tf | 18 +++++ monitoring/main.tf | 85 +++++++++++++++++++++ monitoring/variables.tf | 20 +++++ variables.tf | 4 + 9 files changed, 195 insertions(+) create mode 100644 monitoring/config/alert.rules create mode 100644 monitoring/config/alertmanager.template.yml create mode 100644 monitoring/config/prometheus.yml create mode 100644 monitoring/data.tf create mode 100644 monitoring/images.tf create mode 100644 monitoring/main.tf create mode 100644 monitoring/variables.tf diff --git a/main.tf b/main.tf index 9ad5848..312e21f 100644 --- a/main.tf +++ b/main.tf @@ -24,7 +24,13 @@ module "docker" { domain = "bb8.fun" } + module "radicale" { source = "radicale" domain = "radicale.bb8.fun" } + +module "monitoring" { + source = "monitoring" + gf-security-admin-password = "${var.gf-security-admin-password}" +} diff --git a/monitoring/config/alert.rules b/monitoring/config/alert.rules new file mode 100644 index 0000000..05be2cc --- /dev/null +++ b/monitoring/config/alert.rules @@ -0,0 +1,19 @@ +# Alert for any instance that is unreachable for >1 minute. +ALERT InstanceDown + IF up == 0 + FOR 1m + LABELS { severity = "page" } + ANNOTATIONS { + summary = "Instance {{ $labels.instance }} down", + description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.", + } + +# Alert for any instance that is under high load for >1 minute. 
+ALERT HighLoad + IF node_load1 > 0.85 + FOR 1m + LABELS { severity = "page" } + ANNOTATIONS { + summary = "Instance {{ $labels.instance }} is under high load", + description = "{{ $labels.instance }} of job {{ $labels.job }} has been under high load for more than 1 minute.", +} diff --git a/monitoring/config/alertmanager.template.yml b/monitoring/config/alertmanager.template.yml new file mode 100644 index 0000000..b8ee17c --- /dev/null +++ b/monitoring/config/alertmanager.template.yml @@ -0,0 +1,10 @@ +route: + receiver: "slack" + +receivers: + - name: "slack" + slack_configs: + - send_resolved: true + username: "ALERT_SLACK_USERNAME" + channel: "ALERT_SLACK_CHANNEL" +api_url: "ALERT_SLACK_INCOMING_WEBHOOK_URL" diff --git a/monitoring/config/prometheus.yml b/monitoring/config/prometheus.yml new file mode 100644 index 0000000..f531679 --- /dev/null +++ b/monitoring/config/prometheus.yml @@ -0,0 +1,17 @@ +global: + scrape_interval: 15s + external_labels: + monitor: "docker-monitor" + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["prometheus:9090"] + + - job_name: "node" + scrape_interval: 5s + static_configs: + - targets: ["node_exporter:9100"] + +rule_files: +- "alert.rules" diff --git a/monitoring/data.tf b/monitoring/data.tf new file mode 100644 index 0000000..91f1c7c --- /dev/null +++ b/monitoring/data.tf @@ -0,0 +1,16 @@ +data "docker_registry_image" "grafana" { + name = "grafana/grafana" +} + +data "docker_registry_image" "prometheus" { + name = "prom/prometheus" +} + +data "docker_registry_image" "alertmanager" { + name = "prom/alertmanager" +} + +data "docker_registry_image" "nodeexporter" { + name = "prom/nodeexporter" +} + diff --git a/monitoring/images.tf b/monitoring/images.tf new file mode 100644 index 0000000..a5ad812 --- /dev/null +++ b/monitoring/images.tf @@ -0,0 +1,18 @@ +resource "docker_image" "grafana" { + name = "${data.docker_registry_image.grafana.name}" + pull_triggers = 
["${data.docker_registry_image.grafana.sha256_digest}"] +} + +resource "docker_image" "prometheus" { + name = "${data.docker_registry_image.prometheus.name}" + pull_triggers = ["${data.docker_registry_image.prometheus.sha256_digest}"] +} + +resource "docker_image" "alertmanager" { + name = "${data.docker_registry_image.alertmanager.name}" + pull_triggers = ["${data.docker_registry_image.alertmanager.sha256_digest}"] +} +resource "docker_image" "nodeexporter" { + name = "${data.docker_registry_image.nodeexporter.name}" + pull_triggers = ["${data.docker_registry_image.nodeexporter.sha256_digest}"] +} diff --git a/monitoring/main.tf b/monitoring/main.tf new file mode 100644 index 0000000..8cbf0b4 --- /dev/null +++ b/monitoring/main.tf @@ -0,0 +1,85 @@ +resource docker_container "grafana" { + name = "grafana" + image = "${docker_image.grafana.latest}" + + labels { + # "traefik.frontend.auth.basic" = "${var.basic_auth}" + "traefik.port" = 3000 + "traefik.enable" = "true" + "traefik.frontend.headers.SSLTemporaryRedirect" = "true" + "traefik.frontend.headers.STSSeconds" = "2592000" + "traefik.frontend.headers.STSIncludeSubdomains" = "false" + "traefik.frontend.headers.contentTypeNosniff" = "true" + "traefik.frontend.headers.browserXSSFilter" = "true" + # "traefik.frontend.headers.customResponseHeaders" = "${var.xpoweredby}" + # "traefik.frontend.headers.customFrameOptionsValue" = "${var.xfo_allow}" + } + + volumes { + host_path = "/mnt/xwing/data/grafana" + container_path = "/var/lib/grafana" + } + + links = ["prometheus"] + + restart = "unless-stopped" + destroy_grace_seconds = 10 + must_run = true +} + +resource docker_container "prometheus" { + name = "prometheus" + image = "${docker_image.prometheus.latest}" + + command = ["-config.file=/etc/prometheus/prometheus.yml"] + + volumes { + host_path = "/mnt/xwing/data/prometheus" + container_path = "/prometheus" + } + + upload { + content = "${file("${path.module}/config/prometheus.yml")}" + file = 
"/etc/prometheus/prometheus.yml" + } + + links = ["nodeexporter"] + + restart = "unless-stopped" + destroy_grace_seconds = 10 + must_run = true +} + +resource docker_container "nodeexporter" { + name = "nodeexporter" + image = "${docker_image.nodeexporter.latest}" + + command = ["-config.file=/etc/prometheus/prometheus.yml"] + + volumes { + host_path = "/proc" + container_path = "/host/proc" + } + + volumes { + host_path = "/sys" + container_path = "/host/sys" + } + + volumes { + host_path = "/" + container_path = "/rootfs" + read_only = true + } + + command = [ + "-collector.procfs=/host/proc", + "-collector.sysfs=/host/sys", + "-collector.filesystem.ignored-mount-points=\"^/(sys|proc|dev|host|etc)($$|/)\"" + ] + + restart = "unless-stopped" + destroy_grace_seconds = 10 + must_run = true +} + diff --git a/monitoring/variables.tf b/monitoring/variables.tf new file mode 100644 index 0000000..6eac904 --- /dev/null +++ b/monitoring/variables.tf @@ -0,0 +1,20 @@ +variable "gf-security-admin-password" { + type = "string" +} +# variable "email" { +# type = "string" +# } + +# variable "domain" { +# type = "string" +# } + +variable "alert-slack-username" { + default = "Prometheus" +} +variable "alert-slack-channel" { + default = "#notifications" +} +variable "alert-slack-incoming-webhook" { + default = "https://hooks.slack.com/whatever" +} diff --git a/variables.tf b/variables.tf index 1028573..2524023 100644 --- a/variables.tf +++ b/variables.tf @@ -32,3 +32,7 @@ variable "ips" { static = "139.59.48.222" } } + +variable "gf-security-admin-password" { + type = "string" +} From e8bb8a01b2b58cc4112fc8739ae43e5a2e420460 Mon Sep 17 00:00:00 2001 From: Nemo Date: Thu, 28 Dec 2017 18:50:38 +0530 Subject: [PATCH 2/7] command line flag fixes --- monitoring/data.tf | 8 ++++---- monitoring/images.tf | 9 +++++---- monitoring/main.tf | 10 ++++------ 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/monitoring/data.tf b/monitoring/data.tf index 91f1c7c..b3010c6 100644 --- 
a/monitoring/data.tf +++ b/monitoring/data.tf @@ -6,11 +6,11 @@ data "docker_registry_image" "prometheus" { name = "prom/prometheus" } -data "docker_registry_image" "alertmanager" { - name = "prom/alertmanager" -} +# data "docker_registry_image" "alertmanager" { +# name = "prom/alertmanager" +# } data "docker_registry_image" "nodeexporter" { - name = "prom/nodeexporter" + name = "prom/node-exporter" } diff --git a/monitoring/images.tf b/monitoring/images.tf index a5ad812..2a7d60d 100644 --- a/monitoring/images.tf +++ b/monitoring/images.tf @@ -8,10 +8,11 @@ resource "docker_image" "prometheus" { pull_triggers = ["${data.docker_registry_image.prometheus.sha256_digest}"] } -resource "docker_image" "alertmanager" { - name = "${data.docker_registry_image.alertmanager.name}" - pull_triggers = ["${data.docker_registry_image.alertmanager.sha256_digest}"] -} +# resource "docker_image" "alertmanager" { +# name = "${data.docker_registry_image.alertmanager.name}" +# pull_triggers = ["${data.docker_registry_image.alertmanager.sha256_digest}"] +# } + resource "docker_image" "nodeexporter" { name = "${data.docker_registry_image.nodeexporter.name}" pull_triggers = ["${data.docker_registry_image.nodeexporter.sha256_digest}"] diff --git a/monitoring/main.tf b/monitoring/main.tf index 8cbf0b4..0d8e5b4 100644 --- a/monitoring/main.tf +++ b/monitoring/main.tf @@ -31,7 +31,7 @@ resource docker_container "prometheus" { name = "prometheus" image = "${docker_image.prometheus.latest}" - command = ["-config.file=/etc/prometheus/prometheus.yml"] + command = ["--config.file=/etc/prometheus/prometheus.yml"] volumes { host_path = "/mnt/xwing/data/prometheus" @@ -54,8 +54,6 @@ resource docker_container "nodeexporter" { name = "nodeexporter" image = "${docker_image.nodeexporter.latest}" - command = ["-config.file=/etc/prometheus/prometheus.yml"] - volumes { host_path = "/proc" container_path = "/host/proc" @@ -73,9 +71,9 @@ resource docker_container "nodeexporter" { } command = [ - 
"-collector.procfs=/host/proc", - "-collector.sysfs=/host/sys", - "-collector.filesystem.ignored-mount-points=\"^/(sys|proc|dev|host|etc)($$|/)\"" + "--collector.procfs=/host/proc", + "--collector.sysfs=/host/sys", + "--collector.filesystem.ignored-mount-points=\"^/(sys|proc|dev|host|etc)($$|/)\"" ] restart = "unless-stopped" From 26d9357793fb74d4521e97e6b671aaaff49ddd2e Mon Sep 17 00:00:00 2001 From: Nemo Date: Sat, 6 Jan 2018 13:51:03 +0530 Subject: [PATCH 3/7] Upgrade prometheus flags --- monitoring/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monitoring/main.tf b/monitoring/main.tf index 0d8e5b4..30f0045 100644 --- a/monitoring/main.tf +++ b/monitoring/main.tf @@ -71,8 +71,8 @@ resource docker_container "nodeexporter" { } command = [ - "--collector.procfs=/host/proc", - "--collector.sysfs=/host/sys", + "--path.procfs=/host/proc", + "--path.sysfs=/host/sys", "--collector.filesystem.ignored-mount-points=\"^/(sys|proc|dev|host|etc)($$|/)\"" ] From baf56f0edb11e7fd4093f733cf4ea9f9669df498 Mon Sep 17 00:00:00 2001 From: Nemo Date: Sat, 6 Jan 2018 14:39:49 +0530 Subject: [PATCH 4/7] Fixes nodeexporter --- main.tf | 4 ++-- monitoring/config/prometheus.yml | 7 ++++++- monitoring/data.tf | 1 - monitoring/main.tf | 28 ++++++++++++++++++---------- monitoring/variables.tf | 9 ++++++--- 5 files changed, 32 insertions(+), 17 deletions(-) diff --git a/main.tf b/main.tf index 312e21f..d94433b 100644 --- a/main.tf +++ b/main.tf @@ -24,13 +24,13 @@ module "docker" { domain = "bb8.fun" } - module "radicale" { source = "radicale" domain = "radicale.bb8.fun" } module "monitoring" { - source = "monitoring" + source = "monitoring" gf-security-admin-password = "${var.gf-security-admin-password}" + domain = "bb8.fun" } diff --git a/monitoring/config/prometheus.yml b/monitoring/config/prometheus.yml index f531679..85b0d8f 100644 --- a/monitoring/config/prometheus.yml +++ b/monitoring/config/prometheus.yml @@ -11,7 +11,12 @@ scrape_configs: - job_name: 
"node" scrape_interval: 5s static_configs: - - targets: ["node_exporter:9100"] + - targets: ["nodeexporter:9100"] + + - job_name: 'cadvisor' + scrape_interval: 5s + static_configs: + - targets: ['cadvisor:8080'] rule_files: - "alert.rules" diff --git a/monitoring/data.tf b/monitoring/data.tf index b3010c6..9a192aa 100644 --- a/monitoring/data.tf +++ b/monitoring/data.tf @@ -13,4 +13,3 @@ data "docker_registry_image" "prometheus" { data "docker_registry_image" "nodeexporter" { name = "prom/node-exporter" } - diff --git a/monitoring/main.tf b/monitoring/main.tf index 30f0045..9ed8ac5 100644 --- a/monitoring/main.tf +++ b/monitoring/main.tf @@ -4,13 +4,14 @@ resource docker_container "grafana" { labels { # "traefik.frontend.auth.basic" = "${var.basic_auth}" - "traefik.port" = 3000 - "traefik.enable" = "true" - "traefik.frontend.headers.SSLTemporaryRedirect" = "true" - "traefik.frontend.headers.STSSeconds" = "2592000" - "traefik.frontend.headers.STSIncludeSubdomains" = "false" - "traefik.frontend.headers.contentTypeNosniff" = "true" - "traefik.frontend.headers.browserXSSFilter" = "true" + "traefik.port" = 3000 + "traefik.enable" = "true" + "traefik.frontend.headers.SSLTemporaryRedirect" = "true" + "traefik.frontend.headers.STSSeconds" = "2592000" + "traefik.frontend.headers.STSIncludeSubdomains" = "false" + "traefik.frontend.headers.contentTypeNosniff" = "true" + "traefik.frontend.headers.browserXSSFilter" = "true" + # "traefik.frontend.headers.customResponseHeaders" = "${var.xpoweredby}" # "traefik.frontend.headers.customFrameOptionsValue" = "${var.xfo_allow}" } @@ -22,6 +23,11 @@ resource docker_container "grafana" { links = ["prometheus"] + env = [ + "GF_SECURITY_ADMIN_PASSWORD=${var.gf-security-admin-password}", + "GF_SERVER_ROOT_URL=https://grafana.${var.domain}", + ] + restart = "unless-stopped" destroy_grace_seconds = 10 must_run = true @@ -31,6 +37,9 @@ resource docker_container "prometheus" { name = "prometheus" image = "${docker_image.prometheus.latest}" + # 
prometheus:prometheus + user = "985:983" + command = ["--config.file=/etc/prometheus/prometheus.yml"] volumes { @@ -43,7 +52,7 @@ resource docker_container "prometheus" { file = "/etc/prometheus/prometheus.yml" } - links = ["nodeexporter"] + links = ["nodeexporter", "cadvisor"] restart = "unless-stopped" destroy_grace_seconds = 10 @@ -73,11 +82,10 @@ resource docker_container "nodeexporter" { command = [ "--path.procfs=/host/proc", "--path.sysfs=/host/sys", - "--collector.filesystem.ignored-mount-points=\"^/(sys|proc|dev|host|etc)($$|/)\"" + "--collector.filesystem.ignored-mount-points=\"^/(sys|proc|dev|host|etc)($$|/)\"", ] restart = "unless-stopped" destroy_grace_seconds = 10 must_run = true } - diff --git a/monitoring/variables.tf b/monitoring/variables.tf index 6eac904..5afcf1c 100644 --- a/monitoring/variables.tf +++ b/monitoring/variables.tf @@ -1,20 +1,23 @@ variable "gf-security-admin-password" { type = "string" } + # variable "email" { # type = "string" # } -# variable "domain" { -# type = "string" -# } +variable "domain" { + type = "string" +} variable "alert-slack-username" { default = "Prometheus" } + variable "alert-slack-channel" { default = "#notifications" } + variable "alert-slack-incoming-webhook" { default = "https://hooks.slack.com/whatever" } From e06fd4ab8ed0cb6d3f12c98ed0c9fad9bcde7e43 Mon Sep 17 00:00:00 2001 From: Nemo Date: Sat, 6 Jan 2018 15:55:39 +0530 Subject: [PATCH 5/7] Adds prometheus monitoring --- monitoring/config/prometheus.yml | 7 ++++++- monitoring/data.tf | 4 ++++ monitoring/images.tf | 10 +++++----- monitoring/transmission.tf | 16 ++++++++++++++++ monitoring/variables.tf | 4 ---- 5 files changed, 31 insertions(+), 10 deletions(-) create mode 100644 monitoring/transmission.tf diff --git a/monitoring/config/prometheus.yml b/monitoring/config/prometheus.yml index 85b0d8f..5cfe42a 100644 --- a/monitoring/config/prometheus.yml +++ b/monitoring/config/prometheus.yml @@ -6,7 +6,7 @@ global: scrape_configs: - job_name: "prometheus" 
static_configs: - - targets: ["prometheus:9090"] + - targets: ["localhost:9090"] - job_name: "node" scrape_interval: 5s @@ -18,5 +18,10 @@ scrape_configs: static_configs: - targets: ['cadvisor:8080'] + - job_name: 'transmission' + scrape_interval: 5s + static_configs: + - targets: ['transmission-exporter:19091'] + rule_files: - "alert.rules" diff --git a/monitoring/data.tf b/monitoring/data.tf index 9a192aa..9852676 100644 --- a/monitoring/data.tf +++ b/monitoring/data.tf @@ -13,3 +13,7 @@ data "docker_registry_image" "prometheus" { data "docker_registry_image" "nodeexporter" { name = "prom/node-exporter" } + +data "docker_registry_image" "transmission-exporter" { + name = "captn3m0/transmission-exporter" +} diff --git a/monitoring/images.tf b/monitoring/images.tf index 2a7d60d..46b903d 100644 --- a/monitoring/images.tf +++ b/monitoring/images.tf @@ -8,12 +8,12 @@ resource "docker_image" "prometheus" { pull_triggers = ["${data.docker_registry_image.prometheus.sha256_digest}"] } -# resource "docker_image" "alertmanager" { -# name = "${data.docker_registry_image.alertmanager.name}" -# pull_triggers = ["${data.docker_registry_image.alertmanager.sha256_digest}"] -# } - resource "docker_image" "nodeexporter" { name = "${data.docker_registry_image.nodeexporter.name}" pull_triggers = ["${data.docker_registry_image.nodeexporter.sha256_digest}"] } + +resource "docker_image" "transmission-exporter" { + name = "${data.docker_registry_image.transmission-exporter.name}" + pull_triggers = ["${data.docker_registry_image.transmission-exporter.sha256_digest}"] +} diff --git a/monitoring/transmission.tf b/monitoring/transmission.tf new file mode 100644 index 0000000..c3e0018 --- /dev/null +++ b/monitoring/transmission.tf @@ -0,0 +1,16 @@ +# Transmission Exporter for prometheus +# https://github.com/metalmatze/transmission-exporter +resource docker_container "transmission-exporter" { + name = "transmission-exporter" + image = "${docker_image.transmission-exporter.latest}" + + links = 
["transmission"] + + env = [ + "TRANSMISSION_ADDR=http://transmission:9091" + ] + + restart = "unless-stopped" + destroy_grace_seconds = 10 + must_run = true +} diff --git a/monitoring/variables.tf b/monitoring/variables.tf index 5afcf1c..7f698e3 100644 --- a/monitoring/variables.tf +++ b/monitoring/variables.tf @@ -2,10 +2,6 @@ variable "gf-security-admin-password" { type = "string" } -# variable "email" { -# type = "string" -# } - variable "domain" { type = "string" } From cef86374c017acc61be862852d9ee67033565060 Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 9 Jan 2018 22:30:36 +0530 Subject: [PATCH 6/7] Fix transmission-exporter --- monitoring/data.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monitoring/data.tf b/monitoring/data.tf index 9852676..344dcc3 100644 --- a/monitoring/data.tf +++ b/monitoring/data.tf @@ -15,5 +15,5 @@ data "docker_registry_image" "nodeexporter" { } data "docker_registry_image" "transmission-exporter" { - name = "captn3m0/transmission-exporter" + name = "metalmatze/transmission-exporter" } From 077302294cc3862d1d058b24ecd57f7b4e011484 Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 9 Jan 2018 22:31:37 +0530 Subject: [PATCH 7/7] docs update on transmission-exporter PR --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1004586..32af9d3 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,7 @@ Issues I've faced/reported as a result of this project: 6. `ubooquity` docker container doesn't let you set admin password: https://github.com/linuxserver/docker-ubooquity/issues/17. (Couldn't reproduce, closed) :white_check_mark: 7. Traefik customresponseheaders can't contain colons on the docker backend: https://github.com/containous/traefik/issues/2517. Fixed with https://github.com/containous/traefik/pull/2509 :white_check_mark: 8. Traefik Security headers don't overwrite upstream headers: https://github.com/containous/traefik/issues/2618 +9. 
Transmission exporter broke on mismatched data types while unmarshalling JSON in Go. I filed a PR: https://github.com/metalmatze/transmission-exporter/pull/2 # Plumbing