From 5712a339ce7ed25410b45d7e439394f6e80945b9 Mon Sep 17 00:00:00 2001
From: Nemo
Date: Thu, 28 Dec 2017 17:59:54 +0530
Subject: [PATCH] Initial work on monitoring

- Using https://github.com/danguita/prometheus-monitoring-stack/blob/master/docker-compose.yml as base
---
 main.tf                                     |  6 ++
 monitoring/config/alert.rules               | 19 ++++
 monitoring/config/alertmanager.template.yml | 12 +++
 monitoring/config/prometheus.yml            | 18 ++++
 monitoring/data.tf                          | 19 ++++
 monitoring/images.tf                        | 19 ++++
 monitoring/main.tf                          | 96 +++++++++++++++++++++
 monitoring/variables.tf                     | 20 +++++
 variables.tf                                |  4 +
 9 files changed, 213 insertions(+)
 create mode 100644 monitoring/config/alert.rules
 create mode 100644 monitoring/config/alertmanager.template.yml
 create mode 100644 monitoring/config/prometheus.yml
 create mode 100644 monitoring/data.tf
 create mode 100644 monitoring/images.tf
 create mode 100644 monitoring/main.tf
 create mode 100644 monitoring/variables.tf

diff --git a/main.tf b/main.tf
index 9ad5848..312e21f 100644
--- a/main.tf
+++ b/main.tf
@@ -24,7 +24,13 @@ module "docker" {
   domain = "bb8.fun"
 }
 
+
 module "radicale" {
   source = "radicale"
   domain = "radicale.bb8.fun"
 }
+
+module "monitoring" {
+  source                     = "monitoring"
+  gf-security-admin-password = "${var.gf-security-admin-password}"
+}
diff --git a/monitoring/config/alert.rules b/monitoring/config/alert.rules
new file mode 100644
index 0000000..05be2cc
--- /dev/null
+++ b/monitoring/config/alert.rules
@@ -0,0 +1,19 @@
+# Alert for any instance that is unreachable for >1 minute.
+ALERT InstanceDown
+  IF up == 0
+  FOR 1m
+  LABELS { severity = "page" }
+  ANNOTATIONS {
+    summary = "Instance {{ $labels.instance }} down",
+    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.",
+  }
+
+# Alert for any instance that is under high load for >1 minute.
+ALERT HighLoad
+  IF node_load1 > 0.85
+  FOR 1m
+  LABELS { severity = "page" }
+  ANNOTATIONS {
+    summary = "Instance {{ $labels.instance }} is under high load",
+    description = "{{ $labels.instance }} of job {{ $labels.job }} has been under high load for more than 1 minute.",
+  }
diff --git a/monitoring/config/alertmanager.template.yml b/monitoring/config/alertmanager.template.yml
new file mode 100644
index 0000000..b8ee17c
--- /dev/null
+++ b/monitoring/config/alertmanager.template.yml
@@ -0,0 +1,12 @@
+# Template: replace the ALERT_SLACK_* placeholders with real values before
+# handing this file to Alertmanager.
+route:
+  receiver: "slack"
+
+receivers:
+  - name: "slack"
+    slack_configs:
+      - send_resolved: true
+        username: "ALERT_SLACK_USERNAME"
+        channel: "ALERT_SLACK_CHANNEL"
+        api_url: "ALERT_SLACK_INCOMING_WEBHOOK_URL"
diff --git a/monitoring/config/prometheus.yml b/monitoring/config/prometheus.yml
new file mode 100644
index 0000000..f531679
--- /dev/null
+++ b/monitoring/config/prometheus.yml
@@ -0,0 +1,18 @@
+global:
+  scrape_interval: 15s
+  external_labels:
+    monitor: "docker-monitor"
+
+scrape_configs:
+  - job_name: "prometheus"
+    static_configs:
+      - targets: ["localhost:9090"]
+
+  - job_name: "node"
+    scrape_interval: 5s
+    static_configs:
+      # Host name must match the container name/link alias in monitoring/main.tf.
+      - targets: ["nodeexporter:9100"]
+
+rule_files:
+  - "alert.rules"
diff --git a/monitoring/data.tf b/monitoring/data.tf
new file mode 100644
index 0000000..91f1c7c
--- /dev/null
+++ b/monitoring/data.tf
@@ -0,0 +1,19 @@
+data "docker_registry_image" "grafana" {
+  name = "grafana/grafana"
+}
+
+# Pinned to the 1.x line: config/alert.rules uses the 1.x rule syntax and the
+# container is started with the single-dash 1.x flag style.
+data "docker_registry_image" "prometheus" {
+  name = "prom/prometheus:v1.8.2"
+}
+
+data "docker_registry_image" "alertmanager" {
+  name = "prom/alertmanager"
+}
+
+# Pinned to 0.14: newer releases moved to double-dash flags, so the
+# single-dash options passed in main.tf would be rejected.
+data "docker_registry_image" "nodeexporter" {
+  name = "prom/node-exporter:v0.14.0"
+}
diff --git a/monitoring/images.tf b/monitoring/images.tf
new file mode 100644
index 0000000..a5ad812
--- /dev/null
+++ b/monitoring/images.tf
@@ -0,0 +1,19 @@
+resource "docker_image" "grafana" {
+  name          = "${data.docker_registry_image.grafana.name}"
+  pull_triggers = ["${data.docker_registry_image.grafana.sha256_digest}"]
+}
+
+resource "docker_image" "prometheus" {
+  name          = "${data.docker_registry_image.prometheus.name}"
+  pull_triggers = ["${data.docker_registry_image.prometheus.sha256_digest}"]
+}
+
+resource "docker_image" "alertmanager" {
+  name          = "${data.docker_registry_image.alertmanager.name}"
+  pull_triggers = ["${data.docker_registry_image.alertmanager.sha256_digest}"]
+}
+
+resource "docker_image" "nodeexporter" {
+  name          = "${data.docker_registry_image.nodeexporter.name}"
+  pull_triggers = ["${data.docker_registry_image.nodeexporter.sha256_digest}"]
+}
diff --git a/monitoring/main.tf b/monitoring/main.tf
new file mode 100644
index 0000000..8cbf0b4
--- /dev/null
+++ b/monitoring/main.tf
@@ -0,0 +1,96 @@
+resource "docker_container" "grafana" {
+  name  = "grafana"
+  image = "${docker_image.grafana.latest}"
+
+  # Seed the initial Grafana admin password from the module input.
+  env = ["GF_SECURITY_ADMIN_PASSWORD=${var.gf-security-admin-password}"]
+
+  labels {
+    # "traefik.frontend.auth.basic" = "${var.basic_auth}"
+    "traefik.port"                                  = 3000
+    "traefik.enable"                                = "true"
+    "traefik.frontend.headers.SSLTemporaryRedirect" = "true"
+    "traefik.frontend.headers.STSSeconds"           = "2592000"
+    "traefik.frontend.headers.STSIncludeSubdomains" = "false"
+    "traefik.frontend.headers.contentTypeNosniff"   = "true"
+    "traefik.frontend.headers.browserXSSFilter"     = "true"
+    # "traefik.frontend.headers.customResponseHeaders" = "${var.xpoweredby}"
+    # "traefik.frontend.headers.customFrameOptionsValue" = "${var.xfo_allow}"
+  }
+
+  volumes {
+    host_path      = "/mnt/xwing/data/grafana"
+    container_path = "/var/lib/grafana"
+  }
+
+  # Reference the resource (not a bare string) so Terraform creates the
+  # prometheus container before linking to it.
+  links = ["${docker_container.prometheus.name}"]
+
+  restart               = "unless-stopped"
+  destroy_grace_seconds = 10
+  must_run              = true
+}
+
+resource "docker_container" "prometheus" {
+  name  = "prometheus"
+  image = "${docker_image.prometheus.latest}"
+
+  command = ["-config.file=/etc/prometheus/prometheus.yml"]
+
+  volumes {
+    host_path      = "/mnt/xwing/data/prometheus"
+    container_path = "/prometheus"
+  }
+
+  upload {
+    content = "${file("${path.module}/config/prometheus.yml")}"
+    file    = "/etc/prometheus/prometheus.yml"
+  }
+
+  # prometheus.yml lists "alert.rules" under rule_files, so the rules file
+  # has to sit next to it inside the container.
+  upload {
+    content = "${file("${path.module}/config/alert.rules")}"
+    file    = "/etc/prometheus/alert.rules"
+  }
+
+  links = ["${docker_container.nodeexporter.name}"]
+
+  restart               = "unless-stopped"
+  destroy_grace_seconds = 10
+  must_run              = true
+}
+
+resource "docker_container" "nodeexporter" {
+  name  = "nodeexporter"
+  image = "${docker_image.nodeexporter.latest}"
+
+  volumes {
+    host_path      = "/proc"
+    container_path = "/host/proc"
+  }
+
+  volumes {
+    host_path      = "/sys"
+    container_path = "/host/sys"
+  }
+
+  volumes {
+    host_path      = "/"
+    container_path = "/rootfs"
+    read_only      = true
+  }
+
+  # The regex is passed as a bare argv element, so it must not carry shell
+  # quotes or the docker-compose "$$" escape from the upstream compose file.
+  command = [
+    "-collector.procfs=/host/proc",
+    "-collector.sysfs=/host/sys",
+    "-collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($|/)"
+  ]
+
+  restart               = "unless-stopped"
+  destroy_grace_seconds = 10
+  must_run              = true
+}
diff --git a/monitoring/variables.tf b/monitoring/variables.tf
new file mode 100644
index 0000000..6eac904
--- /dev/null
+++ b/monitoring/variables.tf
@@ -0,0 +1,20 @@
+variable "gf-security-admin-password" {
+  type = "string"
+}
+# variable "email" {
+#   type = "string"
+# }
+
+# variable "domain" {
+#   type = "string"
+# }
+
+variable "alert-slack-username" {
+  default = "Prometheus"
+}
+variable "alert-slack-channel" {
+  default = "#notifications"
+}
+variable "alert-slack-incoming-webhook" {
+  default = "https://hooks.slack.com/whatever"
+}
diff --git a/variables.tf b/variables.tf
index 1028573..2524023 100644
--- a/variables.tf
+++ b/variables.tf
@@ -32,3 +32,7 @@ variable "ips" {
     static = "139.59.48.222"
   }
 }
+
+variable "gf-security-admin-password" {
+  type = "string"
+}