prometheus-adapter故障排查

现象

安装完 kube-prometheus 之后,执行 kubectl top pod 获取不到 Pod 的指标信息。

1
2
3
4
[root@master kube-prometheus-0.14.0]# kubectl top pod 
error: Metrics not available for pod default/php-apache-675bc649f5-tcb2w, age: 1h21m10.222933595s
[root@master kube-prometheus-0.14.0]#

排查思路

检查 apiservice

kubectl top 查询的接口是 v1beta1.metrics.k8s.io。
该接口由 kube-prometheus 中的 prometheus-adapter 组件提供,其指标数据是通过查询 Prometheus 获取的。

1
2
3
4
[root@master manifests]# kubectl get apiservice v1beta1.metrics.k8s.io
NAME SERVICE AVAILABLE AGE
v1beta1.metrics.k8s.io monitoring/prometheus-adapter True 64m
[root@master manifests]#

原先安装的是 metrics-server,这个接口的信息由它提供:

1
2
3
$ kubectl get apiservice v1beta1.metrics.k8s.io
NAME SERVICE AVAILABLE AGE
v1beta1.metrics.k8s.io kube-system/metrics-server True 159d

那么可以定位问题出在prometheus-adapter上。

检查prometheus-adapter配置

1
cat prometheusAdapter-deployment.yaml 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.12.0
name: prometheus-adapter
namespace: monitoring
spec:
replicas: 2
selector:
...
...
...
- configMap:
name: adapter-config
name: config

可知 prometheus-adapter 从 Prometheus 中获取指标数据,其配置挂载自名为 adapter-config 的 ConfigMap,接着检查该 ConfigMap:

查询

1
cat prometheusAdapter-configMap.yaml 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
apiVersion: v1
data:
config.yaml: |-
"resourceRules":
"cpu":
"containerLabel": "container"
"containerQuery": |
sum by (<<.GroupBy>>) (
irate (
container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="",pod!=""}[120s]
)
)
"nodeQuery": |
sum by (<<.GroupBy>>) (
1 - irate(
node_cpu_seconds_total{mode="idle"}[60s]
)
* on(namespace, pod) group_left(node) (
node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}
)
)
or sum by (<<.GroupBy>>) (
1 - irate(
windows_cpu_time_total{mode="idle", job="windows-exporter",<<.LabelMatchers>>}[4m]
)
)
"resources":
"overrides":
"namespace":
"resource": "namespace"
"node":
"resource": "node"
"pod":
"resource": "pod"
"memory":
"containerLabel": "container"
"containerQuery": |
sum by (<<.GroupBy>>) (
container_memory_working_set_bytes{<<.LabelMatchers>>,container!="",pod!=""}
)
"nodeQuery": |
sum by (<<.GroupBy>>) (
node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>}
-
node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}
)
or sum by (<<.GroupBy>>) (
windows_cs_physical_memory_bytes{job="windows-exporter",<<.LabelMatchers>>}
-
windows_memory_available_bytes{job="windows-exporter",<<.LabelMatchers>>}
)
"resources":
"overrides":
"instance":
"resource": "node"
"namespace":
"resource": "namespace"
"pod":
"resource": "pod"
"window": "5m"
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.12.0
name: adapter-config
namespace: monitoring

可知 kubectl top 命令所显示的数据应当是由上述配置文件中的 PromQL 提供的,kubectl top 命令无法显示数据是因为上述查询语句没有数据返回。

我们开启 Prometheus 的 query_log_file 功能(参见 https://prometheus.io/docs/guides/query-log/ ),抓取一下具体的查询语句。
kube-prometheus 的开启方法见 https://github.com/prometheus-operator/prometheus-operator/blob/v0.49.0/Documentation/api.md#prometheusspec ,
如下配置将查询记录输出到标准输出。

1
2
3
4
5
6
7
$ cat prometheus-prometheus.yaml 
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
...
spec:
queryLogFile: /dev/stdout

执行 kubectl top pod 后,使用 kubectl logs prometheus-k8s-0 -c prometheus -n monitoring | grep container_cpu_usage_seconds_total 抓取到的两条查询记录为(第一条来自 prometheus-adapter,其 totalQueryableSamples 为 0,即没有查到任何数据):

1
2
{"httpRequest":{"clientIP":"10.244.1.179","method":"GET","path":"/api/v1/query"},"params":{"end":"2024-12-30T16:31:11.061Z","query":"sum by (pod,container) (\n  irate (\n      container_cpu_usage_seconds_total{namespace=\"default\",pod=\"php-apache-675bc649f5-tcb2w\",container!=\"\",pod!=\"\"}[120s]\n  )\n)\n","start":"2024-12-30T16:31:11.061Z","step":0},"spanID":"0000000000000000","stats":{"timings":{"evalTotalTime":0.000145861,"resultSortTime":0,"queryPreparationTime":0.000036583,"innerEvalTime":0.00010043,"execQueueTime":0.000006156,"execTotalTime":0.000151328},"samples":{"totalQueryableSamples":0,"peakSamples":0}},"ts":"2024-12-30T16:31:11.065Z"}
{"params":{"end":"2024-12-30T16:31:11.226Z","query":"sum by (cluster, namespace, pod, container) (irate(container_cpu_usage_seconds_total{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"}[5m])) * on (cluster, namespace, pod) group_left (node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=\"\"}))","start":"2024-12-30T16:31:11.226Z","step":0},"ruleGroup":{"file":"/etc/prometheus/rules/prometheus-k8s-rulefiles-0/monitoring-kubernetes-monitoring-rules-2735c203-d4be-43f0-be6c-efa836f82dea.yaml","name":"k8s.rules.container_cpu_usage_seconds_total"},"spanID":"0000000000000000","stats":{"timings":{"evalTotalTime":0.00023298,"resultSortTime":0,"queryPreparationTime":0.000029385,"innerEvalTime":0.000199584,"execQueueTime":0.000009953,"execTotalTime":0.000238641},"samples":{"totalQueryableSamples":29,"peakSamples":58}},"ts":"2024-12-30T16:31:11.229Z"}

将语句拿出来去 prometheus 中验证

发现 cpu 与 memory 的查询条件中都包含如下标签匹配条件:

1
container!=""

但是实际采集到的指标中并没有 container 这个标签,因此查询结果为空。

将 cpu 与 memory 两条 containerQuery 中的 container!="" 条件删除,修改后的 ConfigMap 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
apiVersion: v1
data:
config.yaml: |-
"resourceRules":
"cpu":
"containerLabel": "container"
"containerQuery": |
sum by (<<.GroupBy>>) (
irate (
container_cpu_usage_seconds_total{<<.LabelMatchers>>,pod!=""}[120s]
)
)
"nodeQuery": |
sum by (<<.GroupBy>>) (
1 - irate(
node_cpu_seconds_total{mode="idle"}[60s]
)
* on(namespace, pod) group_left(node) (
node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}
)
)
or sum by (<<.GroupBy>>) (
1 - irate(
windows_cpu_time_total{mode="idle", job="windows-exporter",<<.LabelMatchers>>}[4m]
)
)
"resources":
"overrides":
"namespace":
"resource": "namespace"
"node":
"resource": "node"
"pod":
"resource": "pod"
"memory":
"containerLabel": "container"
"containerQuery": |
sum by (<<.GroupBy>>) (
container_memory_working_set_bytes{<<.LabelMatchers>>,pod!=""}
)
"nodeQuery": |
sum by (<<.GroupBy>>) (
node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>}
-
node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}
)
or sum by (<<.GroupBy>>) (
windows_cs_physical_memory_bytes{job="windows-exporter",<<.LabelMatchers>>}
-
windows_memory_available_bytes{job="windows-exporter",<<.LabelMatchers>>}
)
"resources":
"overrides":
"instance":
"resource": "node"
"namespace":
"resource": "namespace"
"pod":
"resource": "pod"
"window": "5m"
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.12.0
name: adapter-config
namespace: monitoring

重新部署了一遍,问题解决了。

至于指标中为什么缺失了 container 这个标签,目前尚未确认。可能是 k8s 更新太快,prometheus-adapter 的默认配置没有及时适配高版本的 k8s。