From 5712a339ce7ed25410b45d7e439394f6e80945b9 Mon Sep 17 00:00:00 2001
From: Nemo <me@captnemo.in>
Date: Thu, 28 Dec 2017 17:59:54 +0530
Subject: [PATCH] Initial work on monitoring

- Using
https://github.com/danguita/prometheus-monitoring-stack/blob/master/docker-compose.yml
as base
---
 main.tf                                     |  6 ++
 monitoring/config/alert.rules               | 19 +++++
 monitoring/config/alertmanager.template.yml | 10 +++
 monitoring/config/prometheus.yml            | 17 +++++
 monitoring/data.tf                          | 16 ++++
 monitoring/images.tf                        | 18 +++++
 monitoring/main.tf                          | 85 +++++++++++++++++++++
 monitoring/variables.tf                     | 20 +++++
 variables.tf                                |  4 +
 9 files changed, 195 insertions(+)
 create mode 100644 monitoring/config/alert.rules
 create mode 100644 monitoring/config/alertmanager.template.yml
 create mode 100644 monitoring/config/prometheus.yml
 create mode 100644 monitoring/data.tf
 create mode 100644 monitoring/images.tf
 create mode 100644 monitoring/main.tf
 create mode 100644 monitoring/variables.tf

diff --git a/main.tf b/main.tf
index 9ad5848..312e21f 100644
--- a/main.tf
+++ b/main.tf
@@ -24,7 +24,13 @@ module "docker" {
   domain              = "bb8.fun"
 }
 
+
 module "radicale" {
   source = "radicale"
   domain = "radicale.bb8.fun"
 }
+
+module "monitoring" {  # monitoring stack from ./monitoring; forwards var.gf-security-admin-password
+  source                     = "monitoring"
+  gf-security-admin-password = "${var.gf-security-admin-password}"
+}
diff --git a/monitoring/config/alert.rules b/monitoring/config/alert.rules
new file mode 100644
index 0000000..05be2cc
--- /dev/null
+++ b/monitoring/config/alert.rules
@@ -0,0 +1,19 @@
+# InstanceDown: severity "page" alert raised when `up == 0` has held for 1m, i.e. an instance is unreachable for >1 minute.
+ALERT InstanceDown
+  IF up == 0
+  FOR 1m
+  LABELS { severity = "page" }
+  ANNOTATIONS {
+    summary = "Instance {{ $labels.instance }} down",
+    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.",
+  }
+
+# HighLoad: severity "page" alert raised when node_load1 > 0.85 has held for 1m. NOTE(review): threshold is absolute, not scaled by CPU count - confirm for multi-core hosts.
+ALERT HighLoad
+  IF node_load1 > 0.85
+  FOR 1m
+  LABELS { severity = "page" }
+  ANNOTATIONS {
+    summary = "Instance {{ $labels.instance }} is under high load",
+    description = "{{ $labels.instance }} of job {{ $labels.job }} has been under high load for more than 1 minute.",
+  }
diff --git a/monitoring/config/alertmanager.template.yml b/monitoring/config/alertmanager.template.yml
new file mode 100644
index 0000000..b8ee17c
--- /dev/null
+++ b/monitoring/config/alertmanager.template.yml
@@ -0,0 +1,10 @@
+route:  # single default route: every alert is sent to the receiver named below
+  receiver: "slack"
+
+receivers:
+  - name: "slack"
+    slack_configs:  # NOTE(review): the quoted upper-case values look like placeholders to substitute before deployment - confirm
+      - send_resolved: true
+        username: "ALERT_SLACK_USERNAME"
+        channel: "ALERT_SLACK_CHANNEL"
+        api_url: "ALERT_SLACK_INCOMING_WEBHOOK_URL"
diff --git a/monitoring/config/prometheus.yml b/monitoring/config/prometheus.yml
new file mode 100644
index 0000000..f531679
--- /dev/null
+++ b/monitoring/config/prometheus.yml
@@ -0,0 +1,17 @@
+global:
+  scrape_interval: 15s  # default for jobs that do not set their own interval
+  external_labels:
+    monitor: "docker-monitor"
+
+scrape_configs:
+  - job_name: "prometheus"
+    static_configs:
+      - targets: ["prometheus:9090"]
+
+  - job_name: "node"
+    scrape_interval: 5s  # overrides the 15s global interval for this job
+    static_configs:
+      - targets: ["nodeexporter:9100"]  # host must match the container name/link "nodeexporter" in monitoring/main.tf
+
+rule_files:
+  - "alert.rules"
diff --git a/monitoring/data.tf b/monitoring/data.tf
new file mode 100644
index 0000000..91f1c7c
--- /dev/null
+++ b/monitoring/data.tf
@@ -0,0 +1,16 @@
+data "docker_registry_image" "grafana" {  # registry lookup; its sha256_digest is used as a pull trigger in images.tf
+  name = "grafana/grafana"
+}
+
+data "docker_registry_image" "prometheus" {  # no tag is given in the name, so the image version is not pinned here
+  name = "prom/prometheus"
+}
+
+data "docker_registry_image" "alertmanager" {  # NOTE(review): no docker_container in monitoring/main.tf uses this image yet
+  name = "prom/alertmanager"
+}
+
+data "docker_registry_image" "nodeexporter" {  # the published repository name is hyphenated: prom/node-exporter
+  name = "prom/node-exporter"
+}
+
diff --git a/monitoring/images.tf b/monitoring/images.tf
new file mode 100644
index 0000000..a5ad812
--- /dev/null
+++ b/monitoring/images.tf
@@ -0,0 +1,18 @@
+resource "docker_image" "grafana" {  # image name and digest both come from the matching data source in data.tf
+  name          = "${data.docker_registry_image.grafana.name}"
+  pull_triggers = ["${data.docker_registry_image.grafana.sha256_digest}"]  # presumably forces a re-pull when the digest changes - see provider docs
+}
+
+resource "docker_image" "prometheus" {  # same pattern as grafana
+  name          = "${data.docker_registry_image.prometheus.name}"
+  pull_triggers = ["${data.docker_registry_image.prometheus.sha256_digest}"]
+}
+
+resource "docker_image" "alertmanager" {  # NOTE(review): not referenced by any container in monitoring/main.tf yet
+  name          = "${data.docker_registry_image.alertmanager.name}"
+  pull_triggers = ["${data.docker_registry_image.alertmanager.sha256_digest}"]
+}
+resource "docker_image" "nodeexporter" {  # same pattern as grafana
+  name          = "${data.docker_registry_image.nodeexporter.name}"
+  pull_triggers = ["${data.docker_registry_image.nodeexporter.sha256_digest}"]
+}
diff --git a/monitoring/main.tf b/monitoring/main.tf
new file mode 100644
index 0000000..8cbf0b4
--- /dev/null
+++ b/monitoring/main.tf
@@ -0,0 +1,85 @@
+resource docker_container "grafana" {  # Grafana container; traefik labels route to port 3000, data kept on the host volume
+  name  = "grafana"
+  image = "${docker_image.grafana.latest}"
+  env   = ["GF_SECURITY_ADMIN_PASSWORD=${var.gf-security-admin-password}"]  # consumes the module's required variable
+  labels {
+    # "traefik.frontend.auth.basic"                      = "${var.basic_auth}"
+    "traefik.port"                                     = 3000
+    "traefik.enable"                                   = "true"
+    "traefik.frontend.headers.SSLTemporaryRedirect"    = "true"
+    "traefik.frontend.headers.STSSeconds"              = "2592000"
+    "traefik.frontend.headers.STSIncludeSubdomains"    = "false"
+    "traefik.frontend.headers.contentTypeNosniff"      = "true"
+    "traefik.frontend.headers.browserXSSFilter"        = "true"
+    # "traefik.frontend.headers.customResponseHeaders"   = "${var.xpoweredby}"
+    # "traefik.frontend.headers.customFrameOptionsValue" = "${var.xfo_allow}"
+  }
+
+  volumes {
+    host_path      = "/mnt/xwing/data/grafana"
+    container_path = "/var/lib/grafana"
+  }
+
+  links = ["${docker_container.prometheus.name}"]  # reference instead of a literal so Terraform creates prometheus first
+
+  restart               = "unless-stopped"
+  destroy_grace_seconds = 10
+  must_run              = true
+}
+
+resource docker_container "prometheus" {  # Prometheus server; config is uploaded from config/prometheus.yml, data kept on the host volume
+  name  = "prometheus"
+  image = "${docker_image.prometheus.latest}"
+
+  command = ["-config.file=/etc/prometheus/prometheus.yml"]  # NOTE(review): single-dash flag style while the image tag is unpinned - confirm it matches the version pulled
+
+  volumes {
+    host_path      = "/mnt/xwing/data/prometheus"
+    container_path = "/prometheus"
+  }
+
+  upload {  # NOTE(review): only prometheus.yml is uploaded; config/alert.rules (listed under rule_files) is not - confirm
+    content = "${file("${path.module}/config/prometheus.yml")}"
+    file    = "/etc/prometheus/prometheus.yml"
+  }
+
+  links = ["${docker_container.nodeexporter.name}"]  # reference instead of a literal so Terraform creates nodeexporter first
+
+  restart               = "unless-stopped"
+  destroy_grace_seconds = 10
+  must_run              = true
+}
+
+resource docker_container "nodeexporter" {  # host metrics exporter; host /proc, /sys and / are mounted for its collectors
+  name  = "nodeexporter"
+  image = "${docker_image.nodeexporter.latest}"
+
+  volumes {
+    host_path      = "/proc"
+    container_path = "/host/proc"
+  }
+
+  volumes {
+    host_path      = "/sys"
+    container_path = "/host/sys"
+  }
+
+  volumes {
+    host_path      = "/"
+    container_path = "/rootfs"
+    read_only      = true
+  }
+
+  # Arguments are passed without a shell, so the regex must not be wrapped in quotes;
+  # a single "$" is used because no docker-compose-style "$$" escaping applies here.
+  command = [
+    "-collector.procfs=/host/proc",
+    "-collector.sysfs=/host/sys",
+    "-collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($|/)"
+  ]
+
+  restart               = "unless-stopped"
+  destroy_grace_seconds = 10
+  must_run              = true
+}
+
diff --git a/monitoring/variables.tf b/monitoring/variables.tf
new file mode 100644
index 0000000..6eac904
--- /dev/null
+++ b/monitoring/variables.tf
@@ -0,0 +1,20 @@
+variable "gf-security-admin-password" {  # required (no default); set by the root main.tf module "monitoring" block
+  type = "string"
+}
+# variable "email" {
+#   type = "string"
+# }
+
+# variable "domain" {
+#   type = "string"
+# }
+
+variable "alert-slack-username" {  # NOTE(review): the alert-slack-* variables are not referenced by any resource in this module yet
+  default = "Prometheus"
+}
+variable "alert-slack-channel" {  # presumably meant for the channel placeholder in config/alertmanager.template.yml - confirm
+  default = "#notifications"
+}
+variable "alert-slack-incoming-webhook" {  # NOTE(review): the default looks like a dummy URL; a real webhook must be supplied
+  default = "https://hooks.slack.com/whatever"
+}
diff --git a/variables.tf b/variables.tf
index 1028573..2524023 100644
--- a/variables.tf
+++ b/variables.tf
@@ -32,3 +32,7 @@ variable "ips" {
     static = "139.59.48.222"
   }
 }
+
+variable "gf-security-admin-password" {  # no default in this block; forwarded to module "monitoring" in main.tf
+  type = "string"
+}