Files
KnowledgeBase/docker/monitor-services.sh
lizhuoran 3c206e9e06 feat: 新增 Docker 部署支持、Swoole/Octane 集成及相关优化
- 添加 Dockerfile 与多套 docker-compose 配置(开发/生产环境)
- 集成 Laravel Octane (Swoole) 提升性能
- 新增健康检查、监控脚本及部署文档
- 新增 Docker 镜像离线导入包(MySQL/Redis/Meilisearch)
- 优化文档转换、预览服务及队列任务
- 添加 CreateAdminUser 命令与路由健康检查接口
- 新增 Swoole 队列兼容性测试套件

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-28 15:51:19 +08:00

316 lines
8.6 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# Docker service monitoring script.
# Continuously monitors service status and takes action when needed.
set -e

# Settings: a non-empty value already present in the environment wins,
# otherwise the default shown here is assigned.
: "${MONITOR_INTERVAL:=60}"                  # monitoring interval, in seconds
: "${MAX_RESTART_ATTEMPTS:=3}"               # maximum number of restart attempts
: "${RESTART_COOLDOWN:=300}"                 # restart cool-down, in seconds
                                             # NOTE(review): no visible code reads this value - confirm
: "${LOG_FILE:=./storage/logs/monitor.log}"  # file that log lines are appended to

# ANSI colour sequences, kept as literal backslash escapes.
RED='\033[0;31m'    GREEN='\033[0;32m'
YELLOW='\033[1;33m' BLUE='\033[0;34m'
NC='\033[0m' # no colour (reset)

# Make sure the directory that will hold the log file exists.
mkdir -p "$(dirname "$LOG_FILE")"
# Logging helpers

#######################################
# Print one timestamped log line and append the same line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - level label, shown in square brackets
#            $2 - message text; backslash escapes are expanded by `echo -e`
# Outputs:   "<YYYY-MM-DD HH:MM:SS> [<level>] <message>" on stdout and
#            appended to LOG_FILE
# Returns:   the status of the echo | tee pipeline
#######################################
log_with_timestamp() {
  local label=$1
  local text=$2
  local now
  # A failing `date` is tolerated: the line is then written with an empty
  # timestamp instead of aborting.
  now=$(date '+%Y-%m-%d %H:%M:%S') || :
  echo -e "${now} [${label}] ${text}" | tee -a "$LOG_FILE"
}
#######################################
# Log a message at level INFO, wrapped in the blue colour sequence.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text
# Returns:   the status of log_with_timestamp
#######################################
log_info() {
  local painted="${BLUE}$1${NC}"
  log_with_timestamp "INFO" "$painted"
}
#######################################
# Log a message at level SUCCESS, wrapped in the green colour sequence.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
# Returns:   the status of log_with_timestamp
#######################################
log_success() {
  local painted="${GREEN}$1${NC}"
  log_with_timestamp "SUCCESS" "$painted"
}
#######################################
# Log a message at level WARNING, wrapped in the yellow colour sequence.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text
# Returns:   the status of log_with_timestamp
#######################################
log_warning() {
  local painted="${YELLOW}$1${NC}"
  log_with_timestamp "WARNING" "$painted"
}
#######################################
# Log a message at level ERROR, wrapped in the red colour sequence.
# Globals:   RED, NC (read)
# Arguments: $1 - message text
# Returns:   the status of log_with_timestamp
#######################################
log_error() {
  local painted="${RED}$1${NC}"
  log_with_timestamp "ERROR" "$painted"
}
# Directory holding the restart-counter files ("<container>.count").
# It can be overridden from the environment, in the same way as the other
# settings of this script; the default location is unchanged.
RESTART_COUNTER_DIR=${RESTART_COUNTER_DIR:-"./storage/logs/restart_counters"}
mkdir -p "$RESTART_COUNTER_DIR"
#######################################
# Print the recorded restart count of a container.
# A missing, unreadable, empty or non-numeric counter file yields 0, so the
# result can always be used in integer comparisons and arithmetic.
# Globals:   RESTART_COUNTER_DIR (read)
# Arguments: $1 - container name
# Outputs:   the count as a non-negative base-10 integer on stdout
#######################################
get_restart_count() {
  local container_name=$1
  local counter_file="$RESTART_COUNTER_DIR/${container_name}.count"
  local count=""

  if [ -f "$counter_file" ]; then
    # Only the first line is relevant; a read error simply leaves count empty.
    { IFS= read -r count < "$counter_file"; } 2>/dev/null || true
    # Drop stray whitespace such as a trailing carriage return.
    count=${count//[[:space:]]/}
  fi

  if [[ "$count" =~ ^[0-9]+$ ]]; then
    # Force base 10 so that a value such as "08" is not rejected as octal.
    echo "$((10#$count))"
  else
    echo "0"
  fi
}
#######################################
# Add one to a container's restart counter and store the new value.
# Globals:   RESTART_COUNTER_DIR (read)
# Arguments: $1 - container name
# Outputs:   the new count on stdout
#######################################
increment_restart_count() {
  local counter_path="${RESTART_COUNTER_DIR}/${1}.count"
  local previous
  # The helper's exit status is deliberately not acted upon.
  previous=$(get_restart_count "$1") || :
  local updated=$((previous + 1))
  printf '%s\n' "$updated" > "$counter_path"
  printf '%s\n' "$updated"
}
#######################################
# Set a container's restart counter back to zero.
# Globals:   RESTART_COUNTER_DIR (read)
# Arguments: $1 - container name
#######################################
reset_restart_count() {
  local target="${RESTART_COUNTER_DIR}/${1}.count"
  printf '%s\n' "0" > "$target"
}
#######################################
# Decide whether a container may be restarted again.
# Globals:   MAX_RESTART_ATTEMPTS (read)
# Arguments: $1 - container name
# Returns:   1 when the recorded restart count has reached
#            MAX_RESTART_ATTEMPTS, 0 otherwise
#######################################
should_restart_container() {
  local attempts
  # The helper's exit status is deliberately not acted upon.
  attempts=$(get_restart_count "$1") || :

  # Limit reached: refuse the restart.
  if [ "$attempts" -ge "$MAX_RESTART_ATTEMPTS" ]; then
    return 1
  fi

  # Every other outcome of the comparison allows a restart.
  return 0
}
#######################################
# Report the health of one container.
# Globals:   none
# Arguments: $1 - container name, as listed by `docker ps`
#            $2 - human-readable service name used in log messages
# Outputs:   log lines through log_error / log_warning
# Returns:   0 - healthy, or running without a health check (the restart
#                counter is reset through reset_restart_count)
#            1 - not running, unhealthy, or not in the "running" state
#            2 - still starting, or an unrecognised health status
#######################################
check_container_health() {
  local container_name=$1
  local service_name=$2

  # Is the container running at all?  The name is matched literally against
  # whole lines: container names may contain ".", which a regular-expression
  # match would treat as a wildcard.
  if ! docker ps --format '{{.Names}}' | grep -qxF -- "$container_name"; then
    log_error "${service_name}容器未运行"
    return 1
  fi

  # The template dereferences .State.Health only when it exists, so a
  # container without a health check reports "no-healthcheck" instead of
  # raising a template error.
  local health_status
  if ! health_status=$(docker inspect \
      --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}no-healthcheck{{end}}' \
      "$container_name" 2>/dev/null); then
    # Replace, rather than append to, whatever a failed inspect printed.
    health_status="no-healthcheck"
  fi

  case "$health_status" in
    "healthy")
      # A healthy container gets its restart counter cleared.
      reset_restart_count "$container_name"
      return 0
      ;;
    "unhealthy")
      log_error "${service_name}容器健康检查失败"
      return 1
      ;;
    "starting")
      log_warning "${service_name}容器正在启动中..."
      return 2
      ;;
    "no-healthcheck" | "")
      # Without a health check, fall back to the plain container state.
      local container_status
      container_status=$(docker inspect --format='{{.State.Status}}' \
        "$container_name" 2>/dev/null) || container_status="unknown"
      if [ "$container_status" = "running" ]; then
        reset_restart_count "$container_name"
        return 0
      else
        log_error "${service_name}容器状态异常: ${container_status}"
        return 1
      fi
      ;;
    *)
      log_warning "${service_name}容器健康状态未知: ${health_status}"
      return 2
      ;;
  esac
}
#######################################
# Restart a container, unless its restart limit has been reached.
# Arguments: $1 - container name
#            $2 - human-readable service name used in log messages
# Outputs:   log lines; the output of `docker restart` is passed through
# Returns:   0 after a successful restart command and the start-up wait,
#            1 when the restart is refused or the restart command fails
#######################################
restart_container() {
  local container_name=$1
  local service_name=$2

  # Guard: give up once the restart limit is reached.
  should_restart_container "$container_name" || {
    log_error "${service_name}容器已达到最大重启次数限制,跳过重启"
    return 1
  }

  # Record the attempt before issuing the command.
  local attempt
  attempt=$(increment_restart_count "$container_name") || :
  log_warning "${service_name}容器开始重启 (第${attempt}次尝试)"

  if ! docker restart "$container_name"; then
    log_error "${service_name}容器重启失败"
    return 1
  fi

  log_info "${service_name}容器重启命令执行成功,等待启动..."
  sleep 30 # give the container time to start
  return 0
}
#######################################
# Check one service and restart its container when it is unhealthy.
# Arguments: $1 - container name
#            $2 - human-readable service name used in log messages
# Returns:   0 when the service is healthy, still starting or in an unknown
#            state, or when the restart succeeded;
#            the status of restart_container when the restart did not succeed
#######################################
monitor_service() {
  local container_name=$1
  local service_name=$2

  # Collect the status through `||` so that a non-zero result is treated as
  # data.  A bare call followed by `$?` terminates the script under `set -e`
  # whenever this function is invoked outside of a conditional.
  local health_result=0
  check_container_health "$container_name" "$service_name" || health_result=$?

  case "$health_result" in
    1)
      # Unhealthy: try a restart and report how it went.
      log_warning "${service_name}服务不健康,尝试重启..."
      restart_container "$container_name" "$service_name" || return $?
      return 0
      ;;
    *)
      # 0 = healthy, 2 = starting or unknown: nothing to do, keep monitoring.
      return 0
      ;;
  esac
}
#######################################
# Raise an alert (extensible).
# Arguments: $1 - alert message
#            $2 - severity; accepted for interface compatibility, nothing
#                 in this function reads it yet
# Outputs:   one error log line; a syslog entry when `logger` is available
# Returns:   0 always - alert delivery is best effort
#######################################
send_alert() {
  local message=$1

  log_error "告警: $message"

  # Further channels can be added here, for example:
  # - e-mail
  # - Slack
  # - a monitoring system
  # - the system log

  # Example: mirror the alert to the system log.  A failing `logger` (for
  # instance when no syslog socket exists) must not take the monitor down
  # under `set -e`, so its status is ignored on purpose.
  if command -v logger >/dev/null 2>&1; then
    logger -t "docker-monitor" -p user.error "$message" || true
  fi

  return 0
}
#######################################
# Main monitoring loop: check every service, report the outcome, sleep,
# repeat.  Does not return.
# Globals:   MONITOR_INTERVAL (read)
# Outputs:   log lines; an alert through send_alert when more than half of
#            the services have a problem
#######################################
main_monitor_loop() {
  log_info "Docker服务监控开始监控间隔: ${MONITOR_INTERVAL}"

  # Services to monitor, one "<container name>:<display name>" per entry.
  local services=(
    "knowledge_base_mysql:MySQL数据库"
    "knowledge_base_redis:Redis缓存"
    "knowledge_base_meilisearch:Meilisearch搜索"
    "knowledge_base_app:Web应用"
    "knowledge_base_queue:队列处理器"
  )
  local total_services=${#services[@]}
  local service container_name service_name failed_services message

  while true; do
    failed_services=0

    log_info "开始监控检查 (共${total_services}个服务)"

    for service in "${services[@]}"; do
      # Split at the first ":" without touching IFS or a global array.
      container_name=${service%%:*}
      service_name=${service#*:}

      if ! monitor_service "$container_name" "$service_name"; then
        # Plain assignment on purpose: `((failed_services++))` has exit
        # status 1 when the old value is 0, which ends the whole script
        # under `set -e` on the first failed service.
        failed_services=$((failed_services + 1))
      fi
    done

    if [ "$failed_services" -gt 0 ]; then
      message="监控检查完成,发现 ${failed_services}/${total_services} 个服务存在问题"
      log_warning "$message"

      # Strictly more than half, as the alert text states.  Comparing
      # against total/2 would round down and already fire at 2 of 5.
      if [ $((failed_services * 2)) -gt "$total_services" ]; then
        send_alert "超过一半的服务出现问题: $message" "critical"
      fi
    else
      log_success "所有服务运行正常"
    fi

    log_info "等待 ${MONITOR_INTERVAL} 秒后进行下次检查..."
    sleep "$MONITOR_INTERVAL"
  done
}
#######################################
# Signal handler: log that the monitor is stopping, then exit with status 0.
#######################################
cleanup() {
  log_info "监控脚本正在退出..."
  exit 0
}

# Install the handler for interrupt (INT) and termination (TERM) signals.
trap 'cleanup' INT TERM
#######################################
# Print the usage text.
# Outputs:   the help text on stdout; "$0" is shown as the program name
#######################################
show_help() {
  # Unquoted delimiter so that $0 is expanded.  The four "(默认: ...)" notes
  # are closed with ")" - they were left unbalanced before.
  cat <<EOF
Docker服务监控脚本

用法: $0 [选项]

选项:
 -i, --interval SECONDS 监控间隔(默认: 60秒)
 -r, --max-restarts NUM 最大重启尝试次数(默认: 3次)
 -c, --cooldown SECONDS 重启冷却时间(默认: 300秒)
 -l, --log-file PATH 日志文件路径(默认: ./storage/logs/monitor.log)
 -h, --help 显示此帮助信息

环境变量:
 MONITOR_INTERVAL 监控间隔
 MAX_RESTART_ATTEMPTS 最大重启尝试次数
 RESTART_COOLDOWN 重启冷却时间
 LOG_FILE 日志文件路径
EOF
}
#######################################
# Abort with a message and the help text when an option has no value.
# Arguments: the remaining command-line arguments, option first
#######################################
_require_option_value() {
  if [[ $# -lt 2 ]]; then
    echo "选项 $1 缺少参数值"
    show_help
    exit 1
  fi
}

#######################################
# Parse the command-line options; they override environment and defaults.
# Globals:   MONITOR_INTERVAL, MAX_RESTART_ATTEMPTS, RESTART_COOLDOWN,
#            LOG_FILE (written)
# Arguments: the script's command-line arguments
# Outputs:   an error message (plus the help text) on invalid usage
# Exits:     0 after -h/--help, 1 on invalid usage
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -i | --interval)
        # Without this check `shift 2` fails on a missing value and `set -e`
        # ends the script without any explanation.
        _require_option_value "$@"
        MONITOR_INTERVAL="$2"
        shift 2
        ;;
      -r | --max-restarts)
        _require_option_value "$@"
        # The limit is used in an integer comparison; a non-integer makes
        # that comparison error out, which silently lifts the limit.
        if ! [[ "$2" =~ ^[0-9]+$ ]]; then
          echo "最大重启尝试次数必须是非负整数: $2"
          exit 1
        fi
        MAX_RESTART_ATTEMPTS="$2"
        shift 2
        ;;
      -c | --cooldown)
        _require_option_value "$@"
        RESTART_COOLDOWN="$2"
        shift 2
        ;;
      -l | --log-file)
        _require_option_value "$@"
        LOG_FILE="$2"
        # The log directory prepared at start-up belongs to the previous
        # LOG_FILE value; create the one for the new path as well.
        mkdir -p "$(dirname "$LOG_FILE")"
        shift 2
        ;;
      -h | --help)
        show_help
        exit 0
        ;;
      *)
        echo "未知选项: $1"
        show_help
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Start the monitor only when this file is executed directly, i.e. when the
# file being read is the program itself, and not when it is sourced.
case "${BASH_SOURCE[0]}" in
  "${0}") main_monitor_loop ;;
esac