![kubernetes 모니터링(grafana, prometheus, alertmanager)](https://img1.daumcdn.net/thumb/R750x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fb7pIY9%2FbtsEtmvEUE1%2Fdqt41F2xyedmhn2uy244lk%2Fimg.png)
목적
시스템 모니터링 + 장애 발생 시 알림 + 문제 발생 전 사전 알림을 받기 위함
구성도
도구
kubespray, kubernetes(이하 k8s), grafana, prometheus, alertmanager, python3, uvicorn, FastAPI, UMS
환경
master1, 2, 3
worker1, 2, 3
k8s version - 1.28.5
진행(개발환경에서 진행하기에 각자 환경에 따라 차이가 발생할 수 있습니다.)
k8s cluster가 배포되어 있고 metallb, ingress controller가 있다는 가정하에 진행하겠습니다.
1. SSL 인증서 등록
#dev
$ ls -alR
/home/master01/ssl/:
total 80
drwxrwxr-x 5 master01 master01 4096 Jan 22 13:31 .
drwxr-x--- 8 master01 master01 4096 Jan 22 13:26 ..
drwxrwxr-x 2 master01 master01 4096 Jan 22 13:29 crt
drwxrwxr-x 2 master01 master01 4096 Jan 22 13:30 pem
drwxrwxr-x 2 master01 master01 4096 Jan 22 13:31 pfx
-rw-rw-r-- 1 master01 master01 26243 Jan 22 13:24 SSL.zip
-rw-rw-r-- 1 master01 master01 7075 Feb 22 2023 Wildcard.*_crt.zip
-rw-rw-r-- 1 master01 master01 5352 Feb 22 2023 Wildcard.*_jks.zip
-rw-rw-r-- 1 master01 master01 6287 Feb 22 2023 Wildcard.*_pem.zip
-rw-rw-r-- 1 master01 master01 7029 Feb 22 2023 Wildcard.*_pfx.zip
/home/master01/ssl/crt:
total 28
drwxrwxr-x 2 master01 master01 4096 Jan 22 13:29 .
drwxrwxr-x 5 master01 master01 4096 Jan 22 13:31 ..
-rw-r--r-- 1 master01 master01 1280 Feb 22 2023 CA_GLOBALSIGN_ROOT_CA.crt
-rw-r--r-- 1 master01 master01 3158 Feb 22 2023 ChainFile_ChainBundle.crt
-rw-r--r-- 1 master01 master01 2562 Feb 22 2023 File_Wildcard.*_crt.crt
-rw-r--r-- 1 master01 master01 1678 Feb 22 2023 KeyFile_Wildcard.*_crt.key
-rw-r--r-- 1 master01 master01 6 Feb 22 2023 password.txt
/home/master01/ssl/pem:
total 24
drwxrwxr-x 2 master01 master01 4096 Jan 22 13:30 .
drwxrwxr-x 5 master01 master01 4096 Jan 22 13:31 ..
-rw-r--r-- 1 master01 master01 1678 Feb 22 2023 KeyFile_Wildcard.*_pem.key
-rw-r--r-- 1 master01 master01 6 Feb 22 2023 password.txt
-rw-r--r-- 1 master01 master01 7004 Feb 22 2023 Wildcard.*_pem.pem
/home/master01/ssl/pfx:
total 20
drwxrwxr-x 2 master01 master01 4096 Jan 22 13:31 .
drwxrwxr-x 5 master01 master01 4096 Jan 22 13:31 ..
-rw-r--r-- 1 master01 master01 9 Feb 22 2023 password.txt
-rw-r--r-- 1 master01 master01 6661 Feb 22 2023 Wildcard.*_pfx.pfx
#create secret
$ kubectl create secret tls dev -n monitoring --cert=/home/master01/ssl/pem/Wildcard.*_pem.pem --key=/home/master01/ssl/pem/KeyFile_Wildcard.*_pem.key
#secret list
$ kubectl get secrets -A
NAMESPACE NAME TYPE DATA AGE
... ... ... 1 14d
monitoring dev kubernetes.io/tls 2 14d
... ... ... 1 14d
인증서 오류 발생
인증서 등록 후 브라우저에 도메인 접속 -> 'Kubernetes Ingress Controller Fake Certificate' 오류가 발생하면 tls 인증서 등록 과정에서 잘못될 확률이 큽니다. 설정에 맞게 다시 등록해주시고 적용하시면 됩니다.
2. kube-prometheus-stack(values.yaml)
$ kubectl create namespace monitoring
$ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
$ helm pull prometheus-community/kube-prometheus-stack
$ tar zxvf kube-prometheus-stack-55.7.0.tgz
$ mv kube-prometheus-stack kube-prometheus-stack-55.7.0
$ cd kube-prometheus-stack-55.7.0
#grafana password 설정
$ vi values.yaml
---
alertmanager:
ingress:
enabled: true
ingressClassName: nginx
annotations:
kubernetes.io/tls-acme: "true"
kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/use-regex: "true"
nginx.ingress.kubernetes.io/rewrite-target: /$2
labels:
app: "monitor"
hosts:
- devtest.test.kr
paths:
- /alertmanager(/|$)(.*)
tls:
- secretName: dev
hosts:
- devtest.test.kr
grafana:
#adminPassword: prom-operator
adminPassword: test1234!@
ingress:
enabled: true
ingressClassName: "nginx"
annotations:
kubernetes.io/tls-acme: "true"
kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/use-regex: "true"
labels:
app: "monitor"
hosts:
- devtest.test.kr
paths:
- /
tls:
- secretName: dev
hosts:
- devtest.test.kr
prometheus:
enabled: true
ingress:
enabled: true
ingressClassName: nginx
annotations:
kubernetes.io/tls-acme: "true"
kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/use-regex: "true"
nginx.ingress.kubernetes.io/rewrite-target: /$2
nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
labels:
app: "monitor"
hosts:
- devtest.test.kr
paths:
- /prometheus(/|$)(.*)
tls:
- secretName: dev
hosts:
- devtest.test.kr
prometheusSpec:
retention: 15d
retentionSize: "10GiB"
storageSpec:
#openebs attach
volumeClaimTemplate:
spec:
storageClassName: openebs-hostpath
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 20Gi
3. pod 접속, grafana id, pw 조회
#id, pw
kubectl get secret --namespace monitoring prometheus-grafana -o jsonpath="{.data.admin-user}" | base64 --decode ; echo
kubectl get secret --namespace monitoring prometheus-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo
#pod 접속
$ kubectl exec --stdin --tty -n monitoring pod/prometheus-stack-grafana-94cb66997-5fq69 -- /bin/bash
#password 변경
$ grafana-cli admin reset-admin-password 'PassW0rd!'
#pod root 접속
$ kubectl exec --stdin --tty -n monitoring pod/prometheus-stack-grafana-94cb66997-5fq69 -- /bin/sh -c "/bin/sh"
4. openebs 설치
host에서 파일을 관리하다보면 host서버가 장애 발생 시 문제가 생길 수 있다. 따라서, openebs, ceph를 이용하여 파일을 관리하면 편하다.(이 부분은 2번 values.yaml의 prometheusSpec: 부분과 이어진다.)
https://github.com/openebs/openebs
https://openebs.io/docs
https://jerryljh.medium.com/openebs-localpv-%EC%98%88%EC%8B%9C-a201148d5978
$ curl -O https://openebs.github.io/charts/openebs-operator-lite.yaml
$ curl -O https://openebs.github.io/charts/openebs-lite-sc.yaml
$ kubectl apply -f openebs-lite-sc.yaml
$ kubectl apply -f openebs-operator-lite.yaml
#default storage
$ kubectl patch storageclass openebs-hostpath -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
$ kubectl get pvc -A
NAMESPACE NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
monitoring prometheus-monitor-kube-prometheus-st-prometheus-db-prometheus-monitor-kube-prometheus-st-prometheus-0 Bound pvc-13781ee1-f5d5-4cba-bdd0-c3c90c51e464 100Gi RWO openebs-hostpath 17h
#worker node path
$ ls -al /var/openebs/local/pvc*
4-1. openebs, ceph를 활용하고 싶지 않을때(보안에 주의해야한다. 개발환경에서만 활용해야함)
#https://kubesec.io/basics/containers-securitycontext-privileged-true/
$ vi values.yaml
---
prometheus:
prometheusSpec:
securityContext:
privileged: true #주의: privileged는 container-level securityContext 필드라서 pod-level인 prometheusSpec.securityContext에서는 거부될 수 있음 — 사용 전 차트 버전에서 확인 필요
#runAsGroup: 2000
#runAsNonRoot: true
#runAsUser: 1000
#fsGroup: 2000
#seccompProfile:
#type: RuntimeDefault
5. 파일 확인
root@master1:~/monitoring/kube-prometheus-stack-55.7.0# kubectl get namespace
NAME STATUS AGE
default Active 31d
ingress-nginx Active 23d
kube-node-lease Active 31d
kube-public Active 31d
kube-system Active 31d
metallb-system Active 30d
monitoring Active 18d
openebs Active 11d
$ kubectl exec --stdin --tty -n monitoring pods/prometheus-monitor-kube-prometheus-st-prometheus-0 -- /bin/sh
/prometheus $ ls
chunks_head data prometheus.yml prometheus.yml.back2 queries.active wal
#worker node 접속
$ cd /var/openebs/local/pvc-<pvc-id>/
$ vi prometheus.yml
---
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: "prometheus"
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
scrape_interval: 15s
static_configs:
- targets: ["localhost:9090"]
- job_name: 'vpc'
scrape_interval: 15s
static_configs:
- targets: ['x.x.x.x:9100', 'x.x.x.x:9100']
포스팅이 좋았다면 "좋아요❤️" 또는 "구독👍🏻" 해주세요!