
Kubernetes monitoring (Grafana, Prometheus, Alertmanager)

주니 · 2024. 2. 5. 09:53

Purpose

Monitor the system and receive alerts when a failure occurs, and ideally before a problem develops.

 

Architecture diagram

Tools

kubespray, kubernetes (hereafter k8s), grafana, prometheus, alertmanager, python3, uvicorn, FastAPI, UMS

 

Environment

master1, 2, 3

worker1, 2, 3

k8s version - 1.28.5

 

Walkthrough (this was done in a development environment, so details may differ depending on your own environment.)

This guide assumes a k8s cluster is already deployed and that MetalLB and an ingress controller are installed.
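A quick sanity check of those prerequisites before starting; a minimal sketch, assuming MetalLB and ingress-nginx were installed into their usual namespaces:

#prerequisite check (namespace names are assumptions)
$ kubectl get pods -n metallb-system
$ kubectl get pods -n ingress-nginx
$ kubectl get ingressclass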

 

1. Register the SSL certificate

#dev
$ ls -alR
/home/master01/ssl/:
total 80
drwxrwxr-x 5 master01 master01  4096 Jan 22 13:31 .
drwxr-x--- 8 master01 master01  4096 Jan 22 13:26 ..
drwxrwxr-x 2 master01 master01  4096 Jan 22 13:29 crt
drwxrwxr-x 2 master01 master01  4096 Jan 22 13:30 pem
drwxrwxr-x 2 master01 master01  4096 Jan 22 13:31 pfx
-rw-rw-r-- 1 master01 master01 26243 Jan 22 13:24 SSL.zip
-rw-rw-r-- 1 master01 master01  7075 Feb 22  2023 Wildcard.*_crt.zip
-rw-rw-r-- 1 master01 master01  5352 Feb 22  2023 Wildcard.*_jks.zip
-rw-rw-r-- 1 master01 master01  6287 Feb 22  2023 Wildcard.*_pem.zip
-rw-rw-r-- 1 master01 master01  7029 Feb 22  2023 Wildcard.*_pfx.zip

/home/master01/ssl/crt:
total 28
drwxrwxr-x 2 master01 master01 4096 Jan 22 13:29 .
drwxrwxr-x 5 master01 master01 4096 Jan 22 13:31 ..
-rw-r--r-- 1 master01 master01 1280 Feb 22  2023 CA_GLOBALSIGN_ROOT_CA.crt
-rw-r--r-- 1 master01 master01 3158 Feb 22  2023 ChainFile_ChainBundle.crt
-rw-r--r-- 1 master01 master01 2562 Feb 22  2023 File_Wildcard.*_crt.crt
-rw-r--r-- 1 master01 master01 1678 Feb 22  2023 KeyFile_Wildcard.*_crt.key
-rw-r--r-- 1 master01 master01    6 Feb 22  2023 password.txt

/home/master01/ssl/pem:
total 24
drwxrwxr-x 2 master01 master01 4096 Jan 22 13:30 .
drwxrwxr-x 5 master01 master01 4096 Jan 22 13:31 ..
-rw-r--r-- 1 master01 master01 1678 Feb 22  2023 KeyFile_Wildcard.*_pem.key
-rw-r--r-- 1 master01 master01    6 Feb 22  2023 password.txt
-rw-r--r-- 1 master01 master01 7004 Feb 22  2023 Wildcard.*_pem.pem

/home/master01/ssl/pfx:
total 20
drwxrwxr-x 2 master01 master01 4096 Jan 22 13:31 .
drwxrwxr-x 5 master01 master01 4096 Jan 22 13:31 ..
-rw-r--r-- 1 master01 master01    9 Feb 22  2023 password.txt
-rw-r--r-- 1 master01 master01 6661 Feb 22  2023 Wildcard.*_pfx.pfx

#create secret (the monitoring namespace from step 2 must already exist)
$ kubectl create secret tls dev -n monitoring --cert=/home/master01/ssl/pem/Wildcard.*_pem.pem --key=/home/master01/ssl/pem/KeyFile_Wildcard.*_pem.key

#secret list
$ kubectl get secrets -A
NAMESPACE        NAME                          TYPE                DATA   AGE
...              ...                           ...                 1      14d
monitoring       dev                           kubernetes.io/tls   2      14d
...              ...                           ...                 1      14d

 

Certificate errors

If you register the certificate, browse to the domain, and get a 'Kubernetes Ingress Controller Fake Certificate' warning, the TLS secret was most likely created incorrectly. Re-create the secret with the correct key and certificate and apply it again.
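To narrow it down, you can compare what the ingress actually serves against what is stored in the secret; a minimal sketch, assuming the host and secret names used in this post:

#certificate actually served by the ingress controller
$ openssl s_client -connect devtest.test.kr:443 -servername devtest.test.kr </dev/null 2>/dev/null | openssl x509 -noout -subject -issuer

#certificate stored in the secret
$ kubectl get secret dev -n monitoring -o jsonpath='{.data.tls\.crt}' | base64 --decode | openssl x509 -noout -subject -dates

If the served issuer is still the Fake Certificate while the secret looks right, check that the ingress's tls.secretName matches and that both live in the same namespace.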

 

2. kube-prometheus-stack (values.yaml)

$ kubectl create namespace monitoring
$ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
$ helm pull prometheus-community/kube-prometheus-stack
$ tar zxvf kube-prometheus-stack-55.7.0.tgz
$ mv kube-prometheus-stack kube-prometheus-stack-55.7.0
$ cd kube-prometheus-stack-55.7.0
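Before editing, it can help to dump the chart's full default values for reference:

#dump default values for reference
$ helm show values prometheus-community/kube-prometheus-stack > values-default.yaml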

#configure ingress, grafana password, and storage
$ vi values.yaml
---
alertmanager:
  ingress:
    enabled: true
    ingressClassName: nginx

    annotations:
      kubernetes.io/tls-acme: "true"
      nginx.ingress.kubernetes.io/ssl-redirect: "true"
      nginx.ingress.kubernetes.io/use-regex: "true"
      nginx.ingress.kubernetes.io/rewrite-target: /$2
    labels:
      app: "monitor"
    hosts:
      - devtest.test.kr
    paths:
      - /alertmanager(/|$)(.*)
    tls:
      - secretName: dev
        hosts:
        - devtest.test.kr

grafana:
  #adminPassword: prom-operator
  adminPassword: test1234!@
  ingress:
    enabled: true
    ingressClassName: "nginx"
    annotations:
      kubernetes.io/tls-acme: "true"
      nginx.ingress.kubernetes.io/ssl-redirect: "true"
      nginx.ingress.kubernetes.io/use-regex: "true"
    labels:
      app: "monitor"
    hosts:
      - devtest.test.kr
    paths:
      - /
    tls:
      - secretName: dev
        hosts:
        - devtest.test.kr

prometheus:
  enabled: true
  ingress:
    enabled: true
    ingressClassName: nginx
    annotations:
      kubernetes.io/tls-acme: "true"
      nginx.ingress.kubernetes.io/ssl-redirect: "true"
      nginx.ingress.kubernetes.io/use-regex: "true"
      nginx.ingress.kubernetes.io/rewrite-target: /$2
      nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
      nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
    labels:
      app: "monitor"
    hosts:
      - devtest.test.kr
    paths:
      - /prometheus(/|$)(.*)
    tls:
      - secretName: dev
        hosts:
        - devtest.test.kr
  prometheusSpec:
    retention: 15d
    retentionSize: "10GiB"
    storageSpec:
      #openebs attach
      volumeClaimTemplate:
        spec:
          storageClassName: openebs-hostpath
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 20Gi
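The install command itself isn't shown above; a sketch, assuming a release name of monitor (an assumption; the generated secret, pod, and PVC names that appear in steps 3-5 all depend on whatever release name you choose):

#install from the edited chart directory
$ helm install monitor . -n monitoring -f values.yaml
#after later changes to values.yaml
$ helm upgrade monitor . -n monitoring -f values.yaml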

 

3. Access the pod and look up the grafana ID/password

#admin id, pw
$ kubectl get secret --namespace monitoring prometheus-grafana -o jsonpath="{.data.admin-user}" | base64 --decode ; echo
$ kubectl get secret --namespace monitoring prometheus-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo

#access the pod
$ kubectl exec --stdin --tty -n monitoring pod/prometheus-stack-grafana-94cb66997-5fq69 -- /bin/bash
#reset the admin password (run inside the pod)
$ grafana-cli admin reset-admin-password 'PassW0rd!'

#if the image has no bash, fall back to sh
$ kubectl exec --stdin --tty -n monitoring pod/prometheus-stack-grafana-94cb66997-5fq69 -- /bin/sh
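After the reset you can sanity-check Grafana through the ingress without opening a browser; a sketch, assuming the host and password used above:

#grafana health check over the ingress
$ curl -k https://devtest.test.kr/api/health
#basic-auth login check
$ curl -k -u admin:'PassW0rd!' https://devtest.test.kr/api/org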

 

4. Install OpenEBS

If Prometheus stores its files directly on a host path, a failure of that host can cause problems. Managing the volumes with OpenEBS or Ceph is more convenient. (This section ties into the prometheusSpec: part of values.yaml in step 2.)

https://github.com/openebs/openebs
https://openebs.io/docs
https://jerryljh.medium.com/openebs-localpv-%EC%98%88%EC%8B%9C-a201148d5978

$ curl -O https://openebs.github.io/charts/openebs-operator-lite.yaml
$ curl -O https://openebs.github.io/charts/openebs-lite-sc.yaml

$ kubectl apply -f openebs-lite-sc.yaml
$ kubectl apply -f openebs-operator-lite.yaml

#set the default storage class
$ kubectl patch storageclass openebs-hostpath -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
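Verify that the patch took effect; the default class is shown with a (default) marker next to its name:

$ kubectl get storageclass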

$ kubectl get pvc -A
NAMESPACE      NAME                                                                                                     STATUS   VOLUME                                     CAPACITY   ACCESS MODES   STORAGECLASS       AGE
monitoring     prometheus-monitor-kube-prometheus-st-prometheus-db-prometheus-monitor-kube-prometheus-st-prometheus-0   Bound    pvc-13781ee1-f5d5-4cba-bdd0-c3c90c51e464   100Gi      RWO            openebs-hostpath   17h

#worker node path
$ ls -al /var/openebs/local/pvc*

 

4-1. If you don't want to use OpenEBS or Ceph (watch out for security; use this only in development)

#https://kubesec.io/basics/containers-securitycontext-privileged-true/

$ vi values.yaml
---
prometheus:
  prometheusSpec:
    securityContext:
      privileged: true
      #runAsGroup: 2000
      #runAsNonRoot: true
      #runAsUser: 1000
      #fsGroup: 2000
      #seccompProfile:
        #type: RuntimeDefault
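Another option: if prometheusSpec.storageSpec is simply left unset, the operator falls back to an emptyDir volume, so metric data is lost whenever the pod is rescheduled; that may be acceptable in a development environment. A minimal sketch:

prometheus:
  prometheusSpec:
    retention: 15d
    #storageSpec omitted -> emptyDir is used (data does not survive pod restarts)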

 

5. Verify the files

root@master1:~/monitoring/kube-prometheus-stack-55.7.0# kubectl get namespace
NAME              STATUS   AGE
default           Active   31d
ingress-nginx     Active   23d
kube-node-lease   Active   31d
kube-public       Active   31d
kube-system       Active   31d
metallb-system    Active   30d
monitoring        Active   18d
openebs           Active   11d

$ kubectl exec --stdin --tty -n monitoring pods/prometheus-monitor-kube-prometheus-st-prometheus-0 -- /bin/sh
/prometheus $ ls
chunks_head           data                  prometheus.yml        prometheus.yml.back2  queries.active        wal

#on the worker node
$ cd /var/openebs/local/pvc*/
$ vi prometheus.yml
---
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    scrape_interval: 15s
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: 'vpc'
    scrape_interval: 15s
    static_configs:
      - targets: ['x.x.x.x:9100', 'x.x.x.x:9100']
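After editing, Prometheus has to reload its configuration. Assuming the lifecycle API is enabled (the prometheus-operator normally runs Prometheus with --web.enable-lifecycle so its config-reloader can use it), a reload can be triggered from inside the pod:

#trigger a config reload (run inside the prometheus pod)
$ curl -X POST http://localhost:9090/-/reload

Note that with kube-prometheus-stack the operator generates prometheus.yml, so hand edits like this can be overwritten; the chart's prometheus.prometheusSpec.additionalScrapeConfigs value is the supported way to add jobs such as the 'vpc' one above.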