Deploying Prometheus + Grafana on Kubernetes

Prometheus Overview

Prometheus is an open-source service monitoring system and time-series database that provides a generic data model along with efficient interfaces for collecting, storing, and querying metrics. Its core component, the Prometheus server, periodically pulls metrics from statically configured monitoring targets or from targets discovered automatically via service discovery; when newly pulled data outgrows the configured in-memory buffer, it is persisted to the storage device.
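The pull model is easy to poke at by hand: every scrape target is just an HTTP endpoint serving metrics in Prometheus's plain-text exposition format. A minimal sketch, with the address standing in for any exporter deployed later in this guide:

```bash
# every target exposes /metrics as plain text; the server scrapes it on an interval
curl -s http://<target-host>:9100/metrics | head -n 8
# each sample is a metric name, optional labels, and a float value, e.g.:
# node_load1 0.52
```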

Prometheus Deployment

  • Download
```bash
git clone https://github.com/iKubernetes/k8s-prom.git
```
  • Deploy node_exporter
```bash
cd k8s-prom
# create the namespace
kubectl apply -f namespace.yaml   # equivalent to: kubectl create ns prom
# deploy node_exporter
kubectl apply -f node_exporter/
# Note: node-exporter-ds.yaml must tolerate the master node's taint under
# spec (excerpt below); otherwise no pod will be created on the master node.
tolerations:
- effect: NoSchedule
  key: node-role.kubernetes.io/master
```
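To see the taint that this toleration matches, inspect the master node directly; a quick check:

```bash
# masters carry the node-role.kubernetes.io/master:NoSchedule taint by default
kubectl describe node <master-node-name> | grep -i taints
```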
```yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: prom
  labels:
    app: prometheus
    component: node-exporter
spec:
  selector:
    matchLabels:
      app: prometheus
      component: node-exporter
  template:
    metadata:
      name: prometheus-node-exporter
      labels:
        app: prometheus
        component: node-exporter
    spec:
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
      containers:
      - image: prom/node-exporter:v0.15.2
        name: prometheus-node-exporter
        ports:
        - name: prom-node-exp
          containerPort: 9100
          hostPort: 9100
      hostNetwork: true
      hostPID: true
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: prometheus-node-exporter
  namespace: prom
  labels:
    app: prometheus
    component: node-exporter
spec:
  clusterIP: None
  ports:
  - name: prometheus-node-exporter
    port: 9100
    protocol: TCP
  selector:
    app: prometheus
    component: node-exporter
  type: ClusterIP
```
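Because the DaemonSet runs with hostNetwork, each node (masters included, thanks to the toleration) serves metrics on port 9100 of its host address. A quick verification sketch, with <node-ip> standing in for any node:

```bash
# expect one pod per node, including the masters
kubectl -n prom get pods -l component=node-exporter -o wide
# node_exporter answers directly on the host network
curl -s http://<node-ip>:9100/metrics | grep '^node_load1'
```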
  • Deploy Prometheus-server
```yaml
cat prometheus-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups:
  - extensions
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: prom
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: prom
```
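After applying the RBAC manifests, kubectl can confirm that the service account actually holds the permissions the server needs for service discovery; these checks are a sanity test, not part of the original manifests:

```bash
# both commands should print "yes"
kubectl auth can-i list nodes --as=system:serviceaccount:prom:prometheus
kubectl auth can-i watch endpoints --as=system:serviceaccount:prom:prometheus
```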
```yaml
[root@k8s-master01 prometheus]# cat prometheus-cfg.yaml
---
kind: ConfigMap
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus-config
  namespace: prom
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 10s
      evaluation_interval: 1m

    scrape_configs:
    - job_name: 'smartops-es'
      static_configs:
      - targets: ['192.168.102.12:9114']
        labels:
          nodename: es-master1
      - targets: ['192.168.102.13:9114']
        labels:
          nodename: es-master2
      - targets: ['192.168.102.14:9114']
        labels:
          nodename: es-master3
      - targets: ['192.168.102.15:9114']
        labels:
          nodename: es-data1
      - targets: ['192.168.102.16:9114']
        labels:
          nodename: es-data2
      - targets: ['192.168.102.17:9114']
        labels:
          nodename: es-data3
      - targets: ['192.168.102.18:9114']
        labels:
          nodename: es-data4
      - targets: ['192.168.102.19:9114']
        labels:
          nodename: es-data5

    - job_name: 'smartops-redis'
      static_configs:
      - targets: ['192.168.102.20:9121']
        labels:
          nodename: redis

    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

    - job_name: 'kubernetes-nodes'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics

    - job_name: 'kubernetes-cadvisor'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name

    - job_name: 'kubernetes-pods'
      honor_labels: false
      kubernetes_sd_configs:
      - role: pod
      tls_config:
        insecure_skip_verify: true
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: pod
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (.+)
```
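One caveat worth noting: the Deployment below mounts prometheus.yml from this ConfigMap via subPath, and subPath mounts do not pick up ConfigMap updates in a running pod. A hedged sketch for rolling out config changes, assuming promtool is available locally and the pod name is filled in:

```bash
# if promtool is available locally, validate the file before applying it
promtool check config prometheus.yml
kubectl -n prom apply -f prometheus-cfg.yaml
# subPath mounts never refresh in place, so recreate the pod to load the change
kubectl -n prom delete pod <prometheus-pod>
```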
```yaml
[root@k8s-master01 prometheus]# cat prometheus-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-pvc
  namespace: prom
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 150Gi
  storageClassName: smartops-nfs-storage

[root@k8s-master01 prometheus]# kubectl get sc
NAME                             PROVISIONER    AGE
smartops-nfs-storage (default)   smartops/nfs   2y
```
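Since smartops-nfs-storage is the cluster's default StorageClass, the claim should be provisioned and bound automatically; worth confirming before starting the server:

```bash
# STATUS should read Bound once the NFS provisioner has created the volume
kubectl -n prom get pvc prometheus-pvc
```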
```yaml
[root@k8s-master01 prometheus]# cat prometheus-deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-server
  namespace: prom
  labels:
    app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
      component: server
    #matchExpressions:
    #- {key: app, operator: In, values: [prometheus]}
    #- {key: component, operator: In, values: [server]}
  template:
    metadata:
      labels:
        app: prometheus
        component: server
      annotations:
        prometheus.io/scrape: 'false'
    spec:
      serviceAccountName: prometheus
      containers:
      - name: prometheus
        image: prom/prometheus:v2.2.1
        imagePullPolicy: Always
        command:
        - prometheus
        - --config.file=/etc/prometheus/prometheus.yml
        - --storage.tsdb.path=/prometheus
        - --storage.tsdb.retention=720h
        ports:
        - containerPort: 9090
          protocol: TCP
        resources:
          limits:
            memory: 2Gi
        volumeMounts:
        - mountPath: /etc/prometheus/prometheus.yml
          name: prometheus-config
          subPath: prometheus.yml
        - mountPath: /prometheus/
          name: prometheus-storage-volume
      volumes:
      - name: prometheus-config
        configMap:
          name: prometheus-config
          items:
          - key: prometheus.yml
            path: prometheus.yml
            mode: 0644
      - name: prometheus-storage-volume
        persistentVolumeClaim:
          claimName: prometheus-pvc
          readOnly: false
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: prom
  labels:
    app: prometheus
spec:
  type: NodePort
  ports:
  - port: 9090
    targetPort: 9090
    nodePort: 30090
    protocol: TCP
  selector:
    app: prometheus
    component: server
```
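With the NodePort Service in place, the web UI and HTTP API are reachable on port 30090 of any node. A quick check, with <node-ip> standing in for any node's address:

```bash
# list the targets service discovery found and whether each scrape is "up"
curl -s http://<node-ip>:30090/api/v1/targets
# run an instant PromQL query, e.g. the 1-minute load reported by node_exporter
curl -s 'http://<node-ip>:30090/api/v1/query?query=node_load1'
```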
  • Deploy kube-state-metrics
```yaml
[root@k8s-master01 kube-state-metrics]# cat kube-state-metrics-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: prom
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
- apiGroups: [""]
  resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
  verbs: ["list", "watch"]
- apiGroups: ["extensions"]
  resources: ["daemonsets", "deployments", "replicasets"]
  verbs: ["list", "watch"]
- apiGroups: ["apps"]
  resources: ["statefulsets"]
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources: ["cronjobs", "jobs"]
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources: ["horizontalpodautoscalers"]
  verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: prom
```
```yaml
[root@k8s-master01 kube-state-metrics]# cat kube-state-metrics-deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: prom
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kube-state-metrics
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        image: gcr.io/google_containers/kube-state-metrics-amd64:v1.3.1
        ports:
        - containerPort: 8080
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: kube-state-metrics
  namespace: prom
  labels:
    app: kube-state-metrics
spec:
  ports:
  - name: kube-state-metrics
    port: 8080
    protocol: TCP
  selector:
    app: kube-state-metrics
```
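The prometheus.io/scrape: 'true' annotation lets the kubernetes-service-endpoints job pick this Service up automatically. To eyeball the metrics directly, a port-forward works; a sketch, with the metric name as one example from kube-state-metrics v1.x:

```bash
kubectl -n prom port-forward svc/kube-state-metrics 8080:8080 &
# object-state metrics, e.g. current replica counts per Deployment
curl -s http://127.0.0.1:8080/metrics | grep '^kube_deployment_status_replicas'
```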
  • Create the certificate
```bash
[root@k8s-master pki]# (umask 077; openssl genrsa -out serving.key 2048)
Generating RSA private key, 2048 bit long modulus
......................+++
....+++
e is 65537 (0x10001)
[root@k8s-master pki]# openssl req -new -key serving.key -out serving.csr -subj "/CN=serving"
[root@k8s-master pki]# openssl x509 -req -in serving.csr -CA ./ca.crt -CAkey ./ca.key -CAcreateserial -out serving.crt -days 3650
Signature ok
subject=/CN=serving
Getting CA Private Key

[root@k8s-master pki]# kubectl create secret generic cm-adapter-serving-certs --from-file=serving.crt=./serving.crt --from-file=serving.key -n prom
secret/cm-adapter-serving-certs created

[root@k8s-master pki]# kubectl get secret -n prom
NAME                       TYPE     DATA   AGE
cm-adapter-serving-certs   Opaque   2      20s
```
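Before wiring the secret into the adapter, the freshly signed certificate can be sanity-checked:

```bash
# confirm the subject and validity window
openssl x509 -in serving.crt -noout -subject -dates
# make sure it actually chains to the CA used for signing
openssl verify -CAfile ./ca.crt serving.crt
```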
  • Deploy k8s-prometheus-adapter
```bash
# The bundled custom-metrics-apiserver-deployment.yaml and custom-metrics-config-map.yaml
# are problematic; download these two files from the k8s-prometheus-adapter project
# and replace the originals.
wget https://raw.githubusercontent.com/DirectXMan12/k8s-prometheus-adapter/master/deploy/manifests/custom-metrics-apiserver-deployment.yaml
wget https://raw.githubusercontent.com/DirectXMan12/k8s-prometheus-adapter/master/deploy/manifests/custom-metrics-config-map.yaml
# change the namespace in both files to prom
kubectl apply -f k8s-prometheus-adapter/
```
```bash
[root@k8s-master k8s-prom]# kubectl get pods -n prom
NAME                                       READY   STATUS    RESTARTS   AGE
custom-metrics-apiserver-65f545496-l5md9   1/1     Running   0          7m
kube-state-metrics-78fc9fc745-g66p8        1/1     Running   0          40m
prometheus-node-exporter-6srrq             1/1     Running   0          1h
prometheus-node-exporter-fftmc             1/1     Running   0          1h
prometheus-node-exporter-qlr8d             1/1     Running   0          1h
prometheus-server-66cbd4c6b-j9lqr          1/1     Running   0          53m

[root@k8s-master k8s-prom]# kubectl api-versions |grep custom
custom.metrics.k8s.io/v1beta1

[root@k8s-master ~]# kubectl get svc -n prom
NAME                       TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)          AGE
custom-metrics-apiserver   ClusterIP   10.99.14.141    <none>        443/TCP          11h
kube-state-metrics         ClusterIP   10.107.23.237   <none>        8080/TCP         11h
prometheus                 NodePort    10.96.65.72     <none>        9090:30090/TCP   11h
prometheus-node-exporter   ClusterIP   None            <none>        9100/TCP         11h
```
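With custom.metrics.k8s.io registered, the aggregated API can be queried straight through the apiserver. A sketch; the metric name in the second path is illustrative, and what is actually available depends on the adapter's config map:

```bash
# enumerate every custom metric the adapter currently exposes
kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1
# fetch one metric for all pods in the prom namespace (metric name is an example)
kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/prom/pods/*/fs_usage_bytes"
```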

Visualizing Data with Grafana

  • Deploy grafana
```yaml
[root@k8s-master01 grafana]# cat grafana-cfg.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-config
  namespace: prom
data:
  grafana.ini: |
    [smtp]
    enabled = true
    host = smtp.xxxx.com:25
    user = ops@xxxx.com
    password = AC4xxxxx070
    skip_verify = true
    from_address = ops@xxxx.com
    from_name = Smartops-Grafana
    [alerting]
    enabled = true
    execute_alerts = true
```
```yaml
[root@k8s-master01 grafana]# cat grafana-dp.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-pvc
  namespace: prom
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
  storageClassName: smartops-nfs-storage
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: monitoring-grafana
  namespace: prom # change the namespace
spec:
  replicas: 1
  selector:
    matchLabels:
      task: monitoring
      k8s-app: grafana
  template:
    metadata:
      labels:
        task: monitoring
        k8s-app: grafana
    spec:
      containers:
      - name: grafana
        #image: registry.cn-hangzhou.aliyuncs.com/google_containers/heapster-grafana-amd64:v5.0.4
        image: grafana/grafana
        ports:
        - containerPort: 3000
          protocol: TCP
        volumeMounts:
        - mountPath: /etc/grafana/grafana.ini
          name: config
          subPath: grafana.ini
        - mountPath: /etc/ssl/certs
          name: ca-certificates
          readOnly: true
        - mountPath: /var/lib/grafana/
          name: grafana-storage
        env: # adapted from the old heapster grafana manifest; its InfluxDB env var must stay commented out
        #- name: INFLUXDB_HOST
        #  value: monitoring-influxdb
        - name: GF_SERVER_HTTP_PORT
          value: "3000"
        - name: GF_AUTH_BASIC_ENABLED
          value: "false"
        - name: GF_AUTH_ANONYMOUS_ENABLED
          value: "true"
        - name: GF_AUTH_ANONYMOUS_ORG_ROLE
          value: Admin
        - name: GF_SERVER_ROOT_URL
          value: /
      volumes:
      - configMap:
          defaultMode: 420
          name: grafana-config
        name: config
      - name: ca-certificates
        hostPath:
          path: /etc/ssl/certs
      - name: grafana-storage
        persistentVolumeClaim:
          claimName: grafana-pvc
          readOnly: false
---
apiVersion: v1
kind: Service
metadata:
  labels:
    kubernetes.io/cluster-service: 'true'
    kubernetes.io/name: monitoring-grafana
  name: monitoring-grafana
  namespace: prom
spec:
  # NodePort exposes the service at a randomly-generated port
  type: NodePort
  ports:
  - port: 80
    targetPort: 3000
  selector:
    k8s-app: grafana
```
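After the rollout, look up the randomly assigned NodePort and open Grafana in a browser; anonymous access with the Admin role is enabled by the env vars above. The Prometheus data source can be added in the UI, or scripted against Grafana's HTTP API; a sketch, where <node-ip> and <node-port> are placeholders and the in-cluster URL follows from the prometheus Service in the prom namespace:

```bash
# find the NodePort mapped to service port 80
kubectl -n prom get svc monitoring-grafana
# create the data source via the HTTP API (anonymous Admin makes this unauthenticated)
curl -s -X POST http://<node-ip>:<node-port>/api/datasources \
  -H 'Content-Type: application/json' \
  -d '{"name":"prometheus","type":"prometheus","url":"http://prometheus.prom.svc:9090","access":"proxy","isDefault":true}'
```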
