最近更新时间:2024.02.28 20:04:51
首次发布时间:2023.07.24 17:34:24
容器服务支持监控集群控制面核心组件,包括:ApiServer、Etcd、Scheduler 和 ClusterAutoscaler。本文为您介绍如何配置和查看控制面组件的监控信息。
说明
创建集群时的其他配置,请参见 创建集群。
ApiServer:配置控制面组件监控后,您可以查看 ApiServer 的指标大盘。设置查询的时间段,并指定刷新方式(手动刷新、自动刷新)。
Etcd:配置控制面组件监控后,您可以查看 Etcd 的指标大盘。设置查询的时间段,并指定刷新方式(手动刷新、自动刷新)。
Scheduler:配置控制面组件监控后,您可以查看 Scheduler 的指标大盘。设置查询的时间段,选择查询的实例,并指定刷新方式(手动刷新、自动刷新)。
ClusterAutoscaler:配置控制面组件监控后,您可以查看 ClusterAutoscaler 的指标大盘。设置查询的时间段,选择查询的实例,并指定刷新方式(手动刷新、自动刷新)。
控制面组件监控大盘的指标清单如下表所示。
大盘分类 | 大盘名称 | PromQL 语句 |
---|---|---|
ApiServer | APIServer QPS | sum(irate(apiserver_request_total{cluster="$clusterId"}[1m])) |
读请求成功率 | sum(irate(apiserver_request_total{cluster="$clusterId",code=~"20.*",verb=~"GET|LIST"}[1m]))/sum(irate(apiserver_request_total{cluster="$clusterId",verb=~"GET|LIST"}[1m]))*100 | |
写请求成功率 | sum(irate(apiserver_request_total{cluster="$clusterId",code=~"20.*",verb!~"GET|LIST|WATCH|CONNECT"}[5m]))/sum(irate(apiserver_request_total{cluster="$clusterId",verb!~"GET|LIST|WATCH|CONNECT"}[5m]))*100 | |
正在处理的读请求数(当前值) | sum(apiserver_current_inflight_requests{cluster="$clusterId",request_kind="readOnly"}) | |
正在处理的写请求数(当前值) | sum(apiserver_current_inflight_requests{cluster="$clusterId",request_kind="mutating"}) | |
废弃 API 的请求数 | sum(apiserver_requested_deprecated_apis{cluster="$clusterId"}) | |
GET 的请求延迟 [P90] | histogram_quantile(0.90, sum(irate(apiserver_request_duration_seconds_bucket{cluster="$clusterId",verb="GET",resource!="",subresource!~"log|proxy"}[1m])) by ( verb, instance,resource, le)) | |
LIST 的请求延迟 [P90] | histogram_quantile(0.90, sum(irate(apiserver_request_duration_seconds_bucket{cluster="$clusterId",verb="LIST",resource!="",subresource!~"log|proxy"}[1m])) by ( verb, instance,resource, le)) | |
写请求延迟 [P90] | histogram_quantile(0.90, sum(irate(apiserver_request_duration_seconds_bucket{cluster="$clusterId",verb!~"GET|WATCH|LIST|CONNECT",resource!="",subresource!~"log|proxy"}[1m])) by ( verb, instance,resource, le)) | |
APIServer 非 2XX 返回值的读请求 | sum by (instance, resource)(increase(apiserver_request_total{cluster="$clusterId",verb="GET", resource!="", code!~"2.."}[1m]) ) | |
APIServer 非 2XX 返回值的写请求 | sum by (instance, resource)(increase(apiserver_request_total{cluster="$clusterId",verb=~"^(POST|PUT|PATCH|DELETE)$", resource!="", code!~"2.."}[1m]) ) | |
APIServer 在处理的读请求数 | sum(apiserver_current_inflight_requests{cluster="$clusterId",request_kind="readOnly"})by(instance) | |
APIServer 在处理的写请求数 | sum(apiserver_current_inflight_requests{cluster="$clusterId",request_kind="mutating"})by(instance) | |
Admit 准入控制器时延 [P90] | histogram_quantile(0.9, sum by(operation, le, type, rejected) (irate(apiserver_admission_controller_admission_duration_seconds_bucket{cluster="$clusterId",type="admit"}[1m]))) | |
Validate 准入控制器时延 [P90] | histogram_quantile(0.9, sum by(operation, le, type, rejected) (irate(apiserver_admission_controller_admission_duration_seconds_bucket{cluster="$clusterId",type="validate"}[5m]))) | |
Admit 准入 Webhook 时延 [P90] | histogram_quantile(0.90, sum by(operation, type,le, rejected) (rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster="$clusterId",type="admit"}[5m]))) | |
Validate 准入 Webhook 时延 [P90] | histogram_quantile(0.90, sum by(operation,le, type, rejected) (rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster="$clusterId",type="validating"}[5m]))) | |
UserAgent 请求 QPS Top 10 | topk(10,sum(rate(apiserver_request_useragent_total{cluster="$clusterId"}[5m])) by(useragent,verb,resource)) | |
UserAgent List 请求 QPS Top 10 | topk(10,sum(rate(apiserver_request_useragent_total{cluster="$clusterId",verb="LIST"}[5m])) by(useragent,resource)) | |
Etcd | has leader | etcd_server_has_leader{cluster="$clusterId"} |
Backend commit 平均时延 [P90] | histogram_quantile(0.9, avg(rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster="$clusterId"}[5m]))by (instance, le)) | |
Proposal failed | sum(etcd_server_proposals_failed_total{cluster="$clusterId"}) | |
Proposal pending | sum(etcd_server_proposals_pending{cluster="$clusterId"}) | |
Backend commit 成功率 | sum(etcd_server_proposals_applied_total{cluster="$clusterId"})by(instance)/sum(etcd_server_proposals_committed_total{cluster="$clusterId"})by(instance)*100 | |
Backend commit 时延 [P90] | histogram_quantile(0.9, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster="$clusterId"}[5m]))by (instance, le)) | |
使用量的趋势 | etcd_mvcc_db_total_size_in_use_in_bytes{cluster="$clusterId"} | |
WAL fsync 操作耗时 | histogram_quantile(0.90, rate(etcd_disk_wal_fsync_duration_seconds_bucket{cluster="$clusterId"}[5m])) | |
Proposal failed 数量趋势 | etcd_server_proposals_failed_total{cluster="$clusterId"} | |
Proposal pending 数量趋势 | etcd_server_proposals_pending{cluster="$clusterId"} | |
Proposal commit - apply 数量趋势 | abs(etcd_server_proposals_committed_total{cluster="$clusterId"}-etcd_server_proposals_applied_total{cluster="$clusterId"}) | |
Scheduler | 存活的调度器实例 | count(sum(scheduler_pod_scheduling_attempts_sum{cluster="$clusterId"})by(instance)) |
处于 Pending Phase 的 Pods 数 | sum(scheduler_pending_pods{cluster="$clusterId",instance=~"$instances"}) | |
请求 APIServer 的 P90 时延 | sum(histogram_quantile(0.9, sum(rate(rest_client_request_duration_seconds_bucket{cluster="$clusterId"}[5m])) by (verb,url,le))) | |
尝试抢占的次数 | sum(scheduler_preemption_attempts_total{cluster="$clusterId",instance=~"$instances"}) | |
缓存中的资源数量 | sum(scheduler_scheduler_cache_size{cluster="$clusterId",type="assumed_pods",instance=~"$instances"})by(type) | |
Pending Pods 数量 | sum(scheduler_pending_pods{cluster="$clusterId",instance=~"$instances"})by(queue) | |
各阶段 Pod 调度数量 | sum(increase(scheduler_scheduling_algorithm_duration_seconds_count{cluster="$clusterId",instance=~"$instances"}[1m])) | |
各阶段 Pod 调度耗时 | histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster="$clusterId",instance=~"$instances"}[5m])) by (le)) | |
请求 APIServer 的 QPS | sum(rate(rest_client_requests_total{cluster="$clusterId"}[1m])) by (method,code) | |
请求 APIServer 的时延趋势图 [P90] | histogram_quantile(0.9, sum(rate(rest_client_request_duration_seconds_bucket{cluster="$clusterId"}[1m])) by (verb,url,le)) | |
ClusterAutoscaler | 集群伸缩状态 | max(cluster_autoscaler_cluster_safe_to_autoscale{cluster="$ClusterId"}) or vector(0) |
是否处于缩容冷却 | sum(cluster_autoscaler_scale_down_in_cooldown{cluster="$ClusterId"}) or vector(0) | |
最近检查扩容时间 | 1000*max(cluster_autoscaler_last_activity{cluster="$ClusterId",activity="scaleUp"}) | |
最近检查缩容时间 | 1000*max(cluster_autoscaler_last_activity{cluster="$ClusterId",activity="scaleDown"}) | |
Unschedulable Pod | sum(cluster_autoscaler_unschedulable_pods_count{cluster="$ClusterId"})by(cluster) | |
count(kube_pod_status_phase{cluster="$ClusterId",phase="Running"}) | ||
弹性伸缩耗时 [P99] | histogram_quantile(0.99, sum(rate(cluster_autoscaler_function_duration_seconds_bucket{cluster="$ClusterId",function="main"}[1m])) by (le)) | |
CA 添加节点数 | sum(increase(cluster_autoscaler_scaled_up_nodes_total{cluster="$ClusterId"}[15s])) | |
sum(increase(cluster_autoscaler_scaled_up_gpu_nodes_total{cluster="$ClusterId"}[15s])) | ||
CA 移除节点数 | sum(increase(cluster_autoscaler_scaled_down_nodes_total{cluster="$ClusterId"}[15s])) by (reason) | |
sum(increase(cluster_autoscaler_scaled_down_gpu_nodes_total{cluster="$ClusterId"}[15s]))by(reason) | ||
CA 驱逐 Pod 数 | sum(increase(cluster_autoscaler_evicted_pods_total{cluster="$ClusterId"}[15s])) | |
CA 扩容失败次数 | sum(increase(cluster_autoscaler_failed_scale_ups_total{cluster="$ClusterId"}[15s]))by(reason) |
说明
如果您需要在托管 Prometheus 的 Explore 功能或告警中心中使用上述 PromQL 语句查看具体的指标或配置告警,请修改或删除语句中关于集群、节点、容器组的变量。例如:将 `cluster="$clusterId"`(或 `cluster="$ClusterId"`)参数中的 `$clusterId`(或 `$ClusterId`)变量修改为具体的集群 ID,或直接删除该参数。
您可以使用托管 Prometheus 的 Explore 功能来快速查询和展示指标数据。详情请参见 指标查询。
您可以在托管 Prometheus 的告警中心配置集群相关告警,包括: