- 添加 Dockerfile 与多套 docker-compose 配置(开发/生产环境) - 集成 Laravel Octane (Swoole) 提升性能 - 新增健康检查、监控脚本及部署文档 - 新增 Docker 镜像离线导入包(MySQL/Redis/Meilisearch) - 优化文档转换、预览服务及队列任务 - 添加 CreateAdminUser 命令与路由健康检查接口 - 新增 Swoole 队列兼容性测试套件 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
316 lines
8.6 KiB
Bash
Executable File
316 lines
8.6 KiB
Bash
Executable File
#!/bin/bash
#
# Docker service monitoring script.
# Continuously monitors service state and takes action when needed.

# Abort on any unhandled non-zero exit status.
set -e

# Configuration. Every value may be overridden through the environment.
MONITOR_INTERVAL=${MONITOR_INTERVAL:-60}         # monitoring interval (seconds)
MAX_RESTART_ATTEMPTS=${MAX_RESTART_ATTEMPTS:-3}  # maximum number of restart attempts
# Restart cooldown (seconds).
# NOTE(review): no function visible in this file reads RESTART_COOLDOWN;
# it is only assigned here and by the option parser - confirm intended use.
RESTART_COOLDOWN=${RESTART_COOLDOWN:-300}
LOG_FILE=${LOG_FILE:-"./storage/logs/monitor.log"}

# Colour definitions. The backslash escapes are stored literally and are
# expanded by the logging helper when a line is printed.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Create the log directory.
mkdir -p "$(dirname "$LOG_FILE")"

# Logging helper.
#
# Prints "<timestamp> [<level>] <message>" to stdout and appends the same
# line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - level label
#            $2 - message; backslash escapes in it are expanded
# Outputs:   the formatted line on stdout and in $LOG_FILE
log_with_timestamp() {
  local level=$1
  local message=$2
  local timestamp
  # Assigned separately from `local` so a failing `date` is not masked.
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # %b expands backslash escapes (such as the \033 colour codes) the same
  # way `echo -e` does, without echo's option-parsing pitfalls.
  printf '%b\n' "${timestamp} [${level}] ${message}" | tee -a "$LOG_FILE"
}

# Log a message at INFO level, wrapped in the BLUE / NC colour codes.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text
log_info() {
  log_with_timestamp "INFO" "${BLUE}$1${NC}"
}

# Log a message at SUCCESS level, wrapped in the GREEN / NC colour codes.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
log_success() {
  log_with_timestamp "SUCCESS" "${GREEN}$1${NC}"
}

# Log a message at WARNING level, wrapped in the YELLOW / NC colour codes.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text
log_warning() {
  log_with_timestamp "WARNING" "${YELLOW}$1${NC}"
}

# Log a message at ERROR level, wrapped in the RED / NC colour codes.
# Globals:   RED, NC (read)
# Arguments: $1 - message text
log_error() {
  log_with_timestamp "ERROR" "${RED}$1${NC}"
}

# Restart counter files: one "<container>.count" file per container lives
# in this directory.
RESTART_COUNTER_DIR="./storage/logs/restart_counters"
mkdir -p "$RESTART_COUNTER_DIR"

# Get the recorded restart count for a container.
#
# Globals:   RESTART_COUNTER_DIR (read)
# Arguments: $1 - container name
# Outputs:   the count on stdout; "0" when the counter file is missing,
#            empty, or does not hold a plain non-negative integer
get_restart_count() {
  local container_name=$1
  local counter_file="$RESTART_COUNTER_DIR/${container_name}.count"
  local count=""

  if [[ -f "$counter_file" ]]; then
    # `read` returns non-zero when the file lacks a trailing newline (or
    # cannot be opened); the value read so far is still usable.
    IFS= read -r count < "$counter_file" || true
  fi

  if [[ "$count" =~ ^[0-9]+$ ]]; then
    # Force base 10 so a value such as "08" is not rejected as bad octal
    # by callers that do arithmetic on the result.
    printf '%d\n' "$((10#$count))"
  else
    # Callers feed this into integer comparisons, so never emit garbage.
    printf '0\n'
  fi
}

# Increment the restart count for a container.
#
# Globals:   RESTART_COUNTER_DIR (read)
# Arguments: $1 - container name
# Outputs:   the new count on stdout; the same value is written to the
#            container's counter file
increment_restart_count() {
  local container_name=$1
  local counter_file="$RESTART_COUNTER_DIR/${container_name}.count"
  local current_count new_count

  # Declared and assigned separately so the exit status of the command
  # substitution is not hidden behind `local`.
  current_count=$(get_restart_count "$container_name")
  new_count=$((current_count + 1))

  echo "$new_count" > "$counter_file"
  echo "$new_count"
}

# Reset the restart count for a container to zero.
#
# Globals:   RESTART_COUNTER_DIR (read)
# Arguments: $1 - container name
# Outputs:   writes "0" to the container's counter file (created if absent)
reset_restart_count() {
  local container_name=$1
  local counter_file="$RESTART_COUNTER_DIR/${container_name}.count"

  echo "0" > "$counter_file"
}

# Check whether a container may still be restarted.
#
# Globals:   MAX_RESTART_ATTEMPTS (read)
# Arguments: $1 - container name
# Returns:   0 when the recorded restart count is below
#            MAX_RESTART_ATTEMPTS (restart allowed), 1 otherwise
# NOTE(review): only the attempt count is consulted here; no cooldown
# interval is taken into account - confirm whether that is intended.
should_restart_container() {
  local container_name=$1
  local restart_count

  # Fall back to 0 if the lookup itself fails, so the integer test below
  # always receives a value.
  restart_count=$(get_restart_count "$container_name") || restart_count=0

  if [ "$restart_count" -ge "$MAX_RESTART_ATTEMPTS" ]; then
    return 1 # limit reached: do not restart
  fi
  return 0 # restart allowed
}

# Check the health of a container.
#
# Arguments: $1 - container name
#            $2 - human-readable service name used in log messages
# Returns:   0 - healthy, or running with no health status available
#                (the restart counter is reset in both cases)
#            1 - not running, unhealthy, or in a non-running state
#            2 - still starting, or health status not recognised
check_container_health() {
  local container_name=$1
  local service_name=$2
  local health_status container_status

  # Check that the container is running. Names are printed one per line
  # without a table header, and matched as a whole-line fixed string so
  # that characters such as "." in the name are not treated as regex.
  if ! docker ps --format '{{.Names}}' | grep -qxF -- "$container_name"; then
    log_error "${service_name}容器未运行"
    return 1
  fi

  # Check the container health status. A failing inspect is treated as
  # "this container has no health check".
  health_status=$(docker inspect --format='{{.State.Health.Status}}' "$container_name" 2>/dev/null) \
    || health_status="no-healthcheck"

  case "$health_status" in
    "healthy")
      # A healthy container gets its restart counter reset.
      reset_restart_count "$container_name"
      return 0
      ;;
    "unhealthy")
      log_error "${service_name}容器健康检查失败"
      return 1
      ;;
    "starting")
      log_warning "${service_name}容器正在启动中..."
      return 2
      ;;
    "no-healthcheck")
      # Without a health check, fall back to the plain container state.
      container_status=$(docker inspect --format='{{.State.Status}}' "$container_name" 2>/dev/null) \
        || container_status="unknown"
      if [ "$container_status" = "running" ]; then
        reset_restart_count "$container_name"
        return 0
      fi
      log_error "${service_name}容器状态异常: ${container_status}"
      return 1
      ;;
    *)
      log_warning "${service_name}容器健康状态未知: ${health_status}"
      return 2
      ;;
  esac
}

# Restart a container, honouring the restart-attempt limit.
#
# Arguments: $1 - container name
#            $2 - human-readable service name used in log messages
# Returns:   0 when `docker restart` succeeded (after a 30 second wait),
#            1 when the attempt limit was reached or the restart failed
restart_container() {
  local container_name=$1
  local service_name=$2
  local restart_count

  if ! should_restart_container "$container_name"; then
    log_error "${service_name}容器已达到最大重启次数限制,跳过重启"
    return 1
  fi

  # The attempt is counted before the restart is issued, so failed
  # restarts also consume an attempt.
  restart_count=$(increment_restart_count "$container_name")
  log_warning "${service_name}容器开始重启 (第${restart_count}次尝试)"

  if ! docker restart "$container_name"; then
    log_error "${service_name}容器重启失败"
    return 1
  fi

  log_info "${service_name}容器重启命令执行成功,等待启动..."
  sleep 30 # give the container time to start
  return 0
}

# Monitor a single service: check its health and restart it if unhealthy.
#
# Arguments: $1 - container name
#            $2 - human-readable service name used in log messages
# Returns:   0 when the service is healthy, starting, or in an unknown
#            state; otherwise the status of the restart attempt
monitor_service() {
  local container_name=$1
  local service_name=$2
  local health_result=0
  local restart_status=0

  # Capture the status through `||` so that, with `set -e` active, a
  # non-zero health result cannot abort the script before it is examined.
  check_container_health "$container_name" "$service_name" || health_result=$?

  case "$health_result" in
    1)
      # Unhealthy: try a restart.
      log_warning "${service_name}服务不健康,尝试重启..."
      restart_container "$container_name" "$service_name" || restart_status=$?
      return "$restart_status"
      ;;
    *)
      # 0 = healthy; 2 = starting or unknown. Nothing to do but keep
      # monitoring. Any other status is treated the same way.
      return 0
      ;;
  esac
}

# Send an alert notification (extensible).
#
# Arguments: $1 - alert message
#            $2 - severity; accepted for callers but not used yet
# Outputs:   logs the alert at ERROR level and, when `logger` is
#            available, writes the message to the system log
send_alert() {
  local message=$1

  log_error "告警: $message"

  # More alert channels can be added here, for example:
  # - e-mail
  # - Slack
  # - a monitoring system
  # - the system log

  # Example: write to the system log.
  if command -v logger >/dev/null 2>&1; then
    logger -t "docker-monitor" -p user.error "$message"
  fi
}

# Main monitoring loop.
#
# Checks every configured service, logs a summary, raises an alert when
# more than half of the services have problems, then sleeps for
# MONITOR_INTERVAL seconds and repeats. Never returns on its own.
# Globals:   MONITOR_INTERVAL (read)
main_monitor_loop() {
  log_info "Docker服务监控开始,监控间隔: ${MONITOR_INTERVAL}秒"

  # Services to monitor, as "container_name:display name".
  local services=(
    "knowledge_base_mysql:MySQL数据库"
    "knowledge_base_redis:Redis缓存"
    "knowledge_base_meilisearch:Meilisearch搜索"
    "knowledge_base_app:Web应用"
    "knowledge_base_queue:队列处理器"
  )
  local total_services=${#services[@]}
  local service container_name service_name failed_services message

  while true; do
    failed_services=0

    log_info "开始监控检查 (共${total_services}个服务)"

    for service in "${services[@]}"; do
      # Split on the first ':' into container name and display name.
      container_name=${service%%:*}
      service_name=${service#*:}

      if ! monitor_service "$container_name" "$service_name"; then
        # Plain assignment rather than ((failed_services++)): the latter
        # yields exit status 1 when the old value is 0, which terminates
        # the script under `set -e` on the very first failed service.
        failed_services=$((failed_services + 1))
      fi
    done

    if ((failed_services > 0)); then
      message="监控检查完成,发现 ${failed_services}/${total_services} 个服务存在问题"
      log_warning "$message"

      # Strictly more than half. Comparing against total/2 with integer
      # division would already fire at 2 of 5 services.
      if ((failed_services * 2 > total_services)); then
        send_alert "超过一半的服务出现问题: $message" "critical"
      fi
    else
      log_success "所有服务运行正常"
    fi

    log_info "等待 ${MONITOR_INTERVAL} 秒后进行下次检查..."
    sleep "$MONITOR_INTERVAL"
  done
}

# Cleanup handler: logs that the monitor is shutting down and exits the
# script with status 0.
cleanup() {
  log_info "监控脚本正在退出..."
  exit 0
}

# Install signal handling: run cleanup on SIGINT and SIGTERM.
trap cleanup SIGINT SIGTERM

# Show usage help.
#
# Outputs: the usage text on stdout; "$0" is expanded to the name the
#          script was invoked with
show_help() {
  # Unquoted delimiter so that $0 is expanded inside the here-document.
  cat <<EOF
Docker服务监控脚本

用法: $0 [选项]

选项:
 -i, --interval SECONDS 监控间隔(默认: 60秒)
 -r, --max-restarts NUM 最大重启尝试次数(默认: 3次)
 -c, --cooldown SECONDS 重启冷却时间(默认: 300秒)
 -l, --log-file PATH 日志文件路径(默认: ./storage/logs/monitor.log)
 -h, --help 显示此帮助信息

环境变量:
 MONITOR_INTERVAL 监控间隔
 MAX_RESTART_ATTEMPTS 最大重启尝试次数
 RESTART_COOLDOWN 重启冷却时间
 LOG_FILE 日志文件路径
EOF
}

# Parse command-line arguments.
#
# Globals:   MONITOR_INTERVAL, MAX_RESTART_ATTEMPTS, RESTART_COOLDOWN,
#            LOG_FILE (written when the matching option is given)
# Arguments: the script's command-line arguments
# Exits:     0 after printing help for -h/--help; 1 for an unknown option
#            or an option that is missing its value
parse_args() {
  while [[ $# -gt 0 ]]; do
    # Options that take a value must be followed by one. Without this
    # check `shift 2` fails on a trailing option and the loop can never
    # make progress.
    case "$1" in
      -i|--interval|-r|--max-restarts|-c|--cooldown|-l|--log-file)
        if [[ $# -lt 2 ]]; then
          echo "选项 $1 需要一个参数" >&2
          show_help
          exit 1
        fi
        ;;
    esac

    case "$1" in
      -i|--interval)
        MONITOR_INTERVAL="$2"
        shift 2
        ;;
      -r|--max-restarts)
        MAX_RESTART_ATTEMPTS="$2"
        shift 2
        ;;
      -c|--cooldown)
        RESTART_COOLDOWN="$2"
        shift 2
        ;;
      -l|--log-file)
        LOG_FILE="$2"
        # Make sure the directory of the newly chosen log file exists
        # before anything is written to it.
        mkdir -p "$(dirname "$LOG_FILE")"
        shift 2
        ;;
      -h|--help)
        show_help
        exit 0
        ;;
      *)
        echo "未知选项: $1"
        show_help
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# Start the monitoring loop only when the script is executed directly,
# not when it is sourced by another script.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  main_monitor_loop
fi