- 遍历 `node-exporter` 服务下的所有 ServiceID。
- 检查每个服务的健康状态:
  - 如果服务健康,调用 Consul API 将其状态更新为 `passing`。
  - 如果服务不健康,记录失败时间,并在超过 10 小时后将其状态更新为 `critical`。
#!/bin/bash
#
# Sync Consul check status for every "node-exporter" service instance.
#
# For each ServiceID listed in the Consul catalog:
#   * probe http://<ServiceID>/metrics (the ServiceID is used verbatim as the
#     host[:port] part of the URL);
#   * if the probe output mentions "node_exporter", mark the service's
#     check(s) as passing and forget any recorded failure time;
#   * otherwise remember when the outage was first seen and mark the
#     check(s) as failed only once the outage has lasted GRACE_SECONDS.
#
# Environment (optional; the defaults are the previously hard-coded values):
#   CONSUL_HTTP_ADDR  base URL of the Consul agent
#   CONSUL_TOKEN      ACL token sent in the X-Consul-Token header
#
# Requires: curl, jq.
#
# NOTE(review): nothing is sent to Consul during the grace period, so this
# presumably relies on the check's TTL being longer than the grace period --
# confirm against the check registration.

set -u

# Consul address and token.
CONSUL_HTTP_ADDR="${CONSUL_HTTP_ADDR:-http://192.168.137.150:8500}"
CONSUL_TOKEN="${CONSUL_TOKEN:-XXXXX}"

readonly SERVICE_NAME="node-exporter"
# 10 hours, in seconds.
readonly GRACE_SECONDS=36000
# Kept at /tmp so failure timestamps written by earlier runs are still found.
readonly STATE_DIR="/tmp"

# Print an error to stderr and abort the run.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# GET a Consul API path ($1) and write the response body to stdout.
# Fails (non-zero) on connection errors and on HTTP status >= 400.
consul_get() {
  curl -sS --fail --connect-timeout 5 --max-time 15 \
    --header "X-Consul-Token: ${CONSUL_TOKEN}" \
    "${CONSUL_HTTP_ADDR}$1"
}

# PUT /v1/agent/check/<action>/<check_id> for every check ID in $2
# (newline-separated). $1 is "pass" or "fail". A failed update is reported
# on stderr but does not stop the run.
update_checks() {
  local action=$1 check_ids=$2 check_id
  while IFS= read -r check_id; do
    if [[ -z "$check_id" ]]; then
      continue
    fi
    curl -sS --fail --connect-timeout 5 --max-time 15 -o /dev/null -X PUT \
      --header "X-Consul-Token: ${CONSUL_TOKEN}" \
      "${CONSUL_HTTP_ADDR}/v1/agent/check/${action}/${check_id}" \
      || printf 'Warning: could not set check %s to %s\n' \
        "$check_id" "$action" >&2
  done <<<"$check_ids"
}

for tool in curl jq; do
  command -v "$tool" >/dev/null || die "required tool not found: $tool"
done

# Fetch both listings once up front; neither depends on the loop variable.
services_json=$(consul_get "/v1/catalog/service/${SERVICE_NAME}") \
  || die "cannot list ${SERVICE_NAME} services from ${CONSUL_HTTP_ADDR}"
service_ids=$(jq -r '.[].ServiceID' <<<"$services_json") \
  || die "unexpected response from the Consul catalog"
checks_json=$(consul_get "/v1/agent/checks") \
  || die "cannot list agent checks from ${CONSUL_HTTP_ADDR}"

# Iterate over all node-exporter service instances, one ServiceID per line.
while IFS= read -r service_id; do
  # An empty catalog yields a single empty line from the here-string.
  if [[ -z "$service_id" ]]; then
    continue
  fi
  echo "Checking service: $service_id"

  # Look up the CheckID(s) attached to this service. The ID is passed with
  # --arg so that it is never interpreted as part of the jq program.
  check_ids=$(jq -r --arg sid "$service_id" \
    '.[] | select(.ServiceID == $sid) | .CheckID' <<<"$checks_json")
  if [[ -z "$check_ids" ]]; then
    echo "Warning: No check ID found for service $service_id, skipping..."
    continue
  fi

  # A "/" in the ID would otherwise point into a non-existent directory.
  fail_file="${STATE_DIR}/${service_id//\//_}_fail_time"

  # Check whether node-exporter is alive. The body is captured first instead
  # of piping into `grep -q`; --max-time bounds a stalled response.
  metrics=$(curl -s --connect-timeout 3 --max-time 10 \
    "http://${service_id}/metrics")
  if [[ "$metrics" == *node_exporter* ]]; then
    echo "Service $service_id is healthy"
    # Send the TTL heartbeat: the service is fine.
    update_checks pass "$check_ids"
    # Drop any failure time recorded by an earlier run.
    rm -f -- "$fail_file"
  else
    echo "Service $service_id is unreachable"
    now=$(date +%s)
    # Record the failure time only the first time the outage is seen.
    if [[ ! -f "$fail_file" ]]; then
      printf '%s\n' "$now" >"$fail_file" \
        || printf 'Warning: cannot write %s\n' "$fail_file" >&2
    fi
    fail_time=$(cat -- "$fail_file" 2>/dev/null)
    # An empty or corrupt state file restarts the grace period rather than
    # breaking the arithmetic below.
    if ! [[ "$fail_time" =~ ^[0-9]+$ ]]; then
      fail_time=$now
      printf '%s\n' "$now" >"$fail_file" 2>/dev/null
    fi
    # Outage duration in seconds; 10# avoids octal parsing of leading zeros.
    down_for=$((now - 10#$fail_time))
    if ((down_for >= GRACE_SECONDS)); then
      echo "Service $service_id has been down for more than 10 hours, marking as critical"
      # Report the check as failed to Consul.
      update_checks fail "$check_ids"
    else
      echo "Service $service_id is down but within 10-hour grace period"
    fi
  fi
done <<<"$service_ids"
这样就可以实现告警的自动化区分和处理。
No Comments