Skip to content

Instantly share code, notes, and snippets.

@torumakabe
Last active May 14, 2023 18:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save torumakabe/e7787218eee07a003143849d0855ae59 to your computer and use it in GitHub Desktop.
Save torumakabe/e7787218eee07a003143849d0855ae59 to your computer and use it in GitHub Desktop.
Azure Kubernetes Serviceの推奨メトリックアラートを設定するTerraform HCLのサンプル
# Terraform CLI and provider requirements for this configuration.
terraform {
  required_version = ">= 0.13.4"

  # Since Terraform 0.13, provider source addresses and version constraints
  # are declared here; the `version` argument inside `provider` blocks is
  # deprecated. The constraints below are the same ones this configuration
  # uses for its azurerm and kubernetes providers.
  required_providers {
    azurerm = {
      source  = "hashicorp/azurerm"
      version = "~> 2.30"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 1.13"
    }
  }
}
# Azure Resource Manager provider, constrained to 2.x releases from 2.30 onward.
provider "azurerm" {
  # Empty features block: no provider feature behaviour is overridden.
  features {}

  version = "~>2.30"
}
# Resource group that hosts the AKS cluster and the metric alerts.
# Both its name and its region are supplied through input variables.
resource "azurerm_resource_group" "sample" {
  location = var.aks_rg_location
  name     = var.aks_rg_name
}
# Looks up an existing Log Analytics workspace (read-only data source); it is
# used as the destination for the OMS agent and the diagnostic setting.
data "azurerm_log_analytics_workspace" "sample" {
  resource_group_name = var.la_workspace_rg_name
  name                = var.la_workspace_name
}
# AKS cluster with a system-assigned identity, a three-node default pool
# spread over zones 1-3, and the OMS agent add-on reporting to the
# Log Analytics workspace looked up above.
resource "azurerm_kubernetes_cluster" "sample" {
  name                = var.aks_cluster_name
  dns_prefix          = var.aks_cluster_name
  resource_group_name = azurerm_resource_group.sample.name
  location            = azurerm_resource_group.sample.location
  kubernetes_version  = "1.18.8"

  identity {
    type = "SystemAssigned"
  }

  default_node_pool {
    name               = "default"
    type               = "VirtualMachineScaleSets"
    vm_size            = "Standard_F2s_v2"
    node_count         = 3
    availability_zones = [1, 2, 3]
  }

  addon_profile {
    # The Kubernetes dashboard add-on is explicitly switched off.
    kube_dashboard {
      enabled = false
    }

    # Monitoring agent that ships container data to Log Analytics.
    oms_agent {
      enabled                    = true
      log_analytics_workspace_id = data.azurerm_log_analytics_workspace.sample.id
    }
  }
}
# Grants the OMS agent's identity the "Monitoring Metrics Publisher" role,
# scoped to the AKS cluster resource.
resource "azurerm_role_assignment" "aks_metrics" {
  role_definition_name = "Monitoring Metrics Publisher"
  scope                = azurerm_kubernetes_cluster.sample.id

  # Object ID of the identity exposed by the cluster's oms_agent add-on.
  principal_id = azurerm_kubernetes_cluster.sample.addon_profile[0].oms_agent[0].oms_agent_identity[0].object_id
}
# Diagnostic setting that sends AKS control-plane logs to the Log Analytics
# workspace. Every category declares a disabled retention policy (days = 0).
resource "azurerm_monitor_diagnostic_setting" "aks" {
  name                       = "aks-diag"
  target_resource_id         = azurerm_kubernetes_cluster.sample.id
  log_analytics_workspace_id = data.azurerm_log_analytics_workspace.sample.id

  # One `log` block per category; the map value is that category's `enabled`
  # flag. "kube-audit" is the only log category that is turned off.
  dynamic "log" {
    for_each = {
      "kube-apiserver"          = true
      "kube-controller-manager" = true
      "kube-scheduler"          = true
      "kube-audit"              = false
      "kube-audit-admin"        = true
      "guard"                   = true
      "cluster-autoscaler"      = true
    }

    content {
      category = log.key
      enabled  = log.value

      retention_policy {
        days    = 0
        enabled = false
      }
    }
  }

  # Platform metrics are not exported through this diagnostic setting.
  metric {
    category = "AllMetrics"
    enabled  = false

    retention_policy {
      days    = 0
      enabled = false
    }
  }
}
# Kubernetes provider wired directly to the AKS cluster created in this
# configuration, using the credentials from its first kube_config entry.
provider "kubernetes" {
  version = "~>1.13"

  # Do not read a kubeconfig file; all connection details are given below.
  load_config_file = false

  host = azurerm_kubernetes_cluster.sample.kube_config[0].host

  # The cluster exports these values base64-encoded, so decode them first.
  cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.sample.kube_config[0].cluster_ca_certificate)
  client_certificate     = base64decode(azurerm_kubernetes_cluster.sample.kube_config[0].client_certificate)
  client_key             = base64decode(azurerm_kubernetes_cluster.sample.kube_config[0].client_key)
}
# Cluster-wide read-only role ("get" and "list") over pod logs, events,
# nodes, pods, deployments and replica sets.
resource "kubernetes_cluster_role" "log_reader" {
  metadata {
    name = "containerhealth-log-reader"
  }

  rule {
    verbs = ["get", "list"]

    # "" is the core API group.
    api_groups = ["", "metrics.k8s.io", "extensions", "apps"]

    resources = [
      "pods/log",
      "events",
      "nodes",
      "pods",
      "deployments",
      "replicasets",
    ]
  }
}
# Binds the log-reader ClusterRole to the "clusterUser" user across the
# whole cluster.
resource "kubernetes_cluster_role_binding" "log_reader" {
  metadata {
    name = "containerhealth-read-logs-global"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"

    # Reference the role resource instead of repeating its name as a literal:
    # this keeps the two names in sync and gives Terraform an implicit
    # dependency, so the ClusterRole is created before this binding.
    name = kubernetes_cluster_role.log_reader.metadata[0].name
  }

  subject {
    api_group = "rbac.authorization.k8s.io"
    kind      = "User"
    name      = "clusterUser"
  }
}
# ConfigMap in kube-system carrying the alertable-metrics settings for the
# OMS agent. It is created only after the metrics-publisher role assignment.
resource "kubernetes_config_map" "oms_agent" {
  metadata {
    namespace = "kube-system"
    name      = "container-azm-ms-agentconfig"
  }

  data = {
    "schema-version" = "v1"
    "config-version" = "ver1"

    # Sets the container working-set memory threshold to 80%. The heredoc
    # body stays at column 0 so the stored text is unchanged.
    "alertable-metrics-configuration-settings" = <<EOT
[alertable_metrics_configuration_settings.container_resource_utilization_thresholds]
container_memory_working_set_threshold_percentage = 80.0
EOT
  }

  # Waiting for omsagent restart & custom metrics preparation: pause for
  # three minutes after the ConfigMap is created, so that resources which
  # depend on it are only created afterwards.
  provisioner "local-exec" {
    command = "sleep 180"
  }

  depends_on = [azurerm_role_assignment.aks_metrics]
}
# Looks up an existing Azure Monitor action group (read-only data source);
# the metric alerts use it as their notification target.
data "azurerm_monitor_action_group" "sample" {
  name                = var.alert_actiongroup_name
  resource_group_name = var.alert_actiongroup_rg_name
}
# Severity-3 metric alert on the AKS cluster: fires when the average of
# oomKilledContainerCount is greater than 0 over a 5-minute window,
# evaluated every minute.
resource "azurerm_monitor_metric_alert" "aks_oom_killed_container_count" {
  name                = "oomKilledContainerCount"
  resource_group_name = azurerm_resource_group.sample.name
  scopes              = [azurerm_kubernetes_cluster.sample.id]

  severity    = 3
  window_size = "PT5M"
  frequency   = "PT1M"

  criteria {
    metric_namespace = "Insights.Container/pods"
    metric_name      = "oomKilledContainerCount"
    aggregation      = "Average"
    operator         = "GreaterThan"
    threshold        = 0

    # "*" includes every value of each dimension.
    dimension {
      name     = "kubernetes namespace"
      operator = "Include"
      values   = ["*"]
    }

    dimension {
      name     = "controllerName"
      operator = "Include"
      values   = ["*"]
    }
  }

  action {
    action_group_id = data.azurerm_monitor_action_group.sample.id
  }

  # Create the alert only after the agent ConfigMap (and its post-create
  # wait) has completed.
  depends_on = [kubernetes_config_map.oms_agent]
}
# Severity-3 metric alert on the AKS cluster: fires when the average of
# memoryWorkingSetExceededPercentage is greater than 95 over a 5-minute
# window, evaluated every minute.
resource "azurerm_monitor_metric_alert" "aks_memory_ws_exceeds_percentage" {
  name                = "memoryWorkingSetExceededPercentage"
  resource_group_name = azurerm_resource_group.sample.name
  scopes              = [azurerm_kubernetes_cluster.sample.id]

  severity    = 3
  window_size = "PT5M"
  frequency   = "PT1M"

  criteria {
    metric_namespace = "Insights.Container/containers"
    metric_name      = "memoryWorkingSetExceededPercentage"
    aggregation      = "Average"
    operator         = "GreaterThan"
    threshold        = 95

    # "*" includes every value of each dimension.
    dimension {
      name     = "kubernetes namespace"
      operator = "Include"
      values   = ["*"]
    }

    dimension {
      name     = "controllerName"
      operator = "Include"
      values   = ["*"]
    }
  }

  action {
    action_group_id = data.azurerm_monitor_action_group.sample.id
  }

  # Create the alert only after the agent ConfigMap (and its post-create
  # wait) has completed.
  depends_on = [kubernetes_config_map.oms_agent]
}
# Name of the resource group created for the AKS cluster.
# The default is a placeholder; override it with a real value.
variable "aks_rg_name" {
  default = "your-aks-resource-group-name"
  type    = string
}
# Azure region of the AKS resource group (the cluster uses the same region).
# The default is a placeholder; override it with a real value.
variable "aks_rg_location" {
  default = "your-aks-resource-group-location"
  type    = string
}
# Name of the AKS cluster; also used as the cluster's DNS prefix.
# The default is a placeholder; override it with a real value.
variable "aks_cluster_name" {
  default = "your-aks-cluster-name"
  type    = string
}
# Name of the existing Log Analytics workspace to look up.
# The default is a placeholder; override it with a real value.
variable "la_workspace_name" {
  default = "your-log-analytics-workspace-name"
  type    = string
}
# Resource group that contains the existing Log Analytics workspace.
# The default is a placeholder; override it with a real value.
variable "la_workspace_rg_name" {
  default = "your-log-analytics-workspace-resource-group-name"
  type    = string
}
# Name of the existing Azure Monitor action group the alerts notify.
# The default is a placeholder; override it with a real value.
variable "alert_actiongroup_name" {
  default = "your-alert-actiongroup-name"
  type    = string
}
# Resource group that contains the existing Azure Monitor action group.
# The default is a placeholder; override it with a real value.
variable "alert_actiongroup_rg_name" {
  type = string

  # Placeholder spelling corrected ("grouop" -> "group").
  default = "your-alert-actiongroup-resource-group-name"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment