#!/bin/bash
#
# Docker service monitor.
#
# Periodically checks a fixed list of containers. A container that is not
# running, or whose health check reports "unhealthy", is restarted up to
# MAX_RESTART_ATTEMPTS times; the per-container attempt counter is reset as
# soon as the container is seen healthy again. When more than half of the
# services fail in a single pass an alert is raised.
#
# Usage:        see show_help (-h / --help)
# Environment:  MONITOR_INTERVAL, MAX_RESTART_ATTEMPTS, RESTART_COOLDOWN, LOG_FILE

set -e

# ---------------------------------------------------------------------------
# Configuration (environment first, command-line options override below)
# ---------------------------------------------------------------------------
MONITOR_INTERVAL=${MONITOR_INTERVAL:-60}           # seconds between passes
MAX_RESTART_ATTEMPTS=${MAX_RESTART_ATTEMPTS:-3}    # restart attempts per container
# NOTE(review): RESTART_COOLDOWN is accepted and documented in the help text,
# but nothing in this script reads it yet — confirm the intended semantics
# before wiring it in.
RESTART_COOLDOWN=${RESTART_COOLDOWN:-300}          # seconds
LOG_FILE=${LOG_FILE:-"./storage/logs/monitor.log"}

# ANSI colour sequences (expanded by `echo -e` in log_with_timestamp)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Directory holding one "<container>.count" file per container
RESTART_COUNTER_DIR="./storage/logs/restart_counters"

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

# Print "<timestamp> [<level>] <message>" to stdout and append it to LOG_FILE.
# Arguments: $1 - level label, $2 - message (may contain colour escapes)
log_with_timestamp() {
  local level=$1
  local message=$2
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} [${level}] ${message}" | tee -a "$LOG_FILE"
}

log_info() {
  log_with_timestamp "INFO" "${BLUE}$1${NC}"
}

log_success() {
  log_with_timestamp "SUCCESS" "${GREEN}$1${NC}"
}

log_warning() {
  log_with_timestamp "WARNING" "${YELLOW}$1${NC}"
}

log_error() {
  log_with_timestamp "ERROR" "${RED}$1${NC}"
}

# ---------------------------------------------------------------------------
# Restart counters
# ---------------------------------------------------------------------------

# Print the stored restart count for a container.
# Arguments: $1 - container name
# Outputs:   the count on stdout; 0 when the counter file is missing,
#            unreadable or does not contain a plain non-negative integer.
get_restart_count() {
  local container_name=$1
  local counter_file="$RESTART_COUNTER_DIR/${container_name}.count"
  local count=0

  if [[ -f "$counter_file" ]]; then
    count=$(<"$counter_file") || count=0
  fi

  # A corrupt counter must not break the integer comparisons made by callers.
  if [[ ! "$count" =~ ^[0-9]+$ ]]; then
    count=0
  fi

  # Force base 10 so a value such as "08" is not parsed as invalid octal.
  echo "$((10#$count))"
}

# Increment the stored restart count and print the new value.
# Arguments: $1 - container name
increment_restart_count() {
  local container_name=$1
  local counter_file="$RESTART_COUNTER_DIR/${container_name}.count"
  local current_count
  current_count=$(get_restart_count "$container_name")
  local new_count=$((current_count + 1))

  echo "$new_count" > "$counter_file"
  echo "$new_count"
}

# Reset the stored restart count to 0.
# Arguments: $1 - container name
reset_restart_count() {
  local container_name=$1
  local counter_file="$RESTART_COUNTER_DIR/${container_name}.count"

  echo "0" > "$counter_file"
}

# Decide whether another restart attempt is allowed.
# Arguments: $1 - container name
# Returns:   0 if the count is below MAX_RESTART_ATTEMPTS, 1 otherwise.
should_restart_container() {
  local container_name=$1
  local restart_count
  restart_count=$(get_restart_count "$container_name")

  if [ "$restart_count" -ge "$MAX_RESTART_ATTEMPTS" ]; then
    return 1 # limit reached, do not restart
  fi
  return 0   # restart allowed
}

# ---------------------------------------------------------------------------
# Health checks and restarts
# ---------------------------------------------------------------------------

# Inspect one container.
# Arguments: $1 - container name, $2 - display name used in log messages
# Returns:   0 healthy (restart counter is reset)
#            1 not running / unhealthy / abnormal state
#            2 still starting, or health status not recognised
check_container_health() {
  local container_name=$1
  local service_name=$2

  # Plain name listing (no table header); -F -x matches the whole name
  # literally so regex metacharacters in the name cannot cause false matches.
  if ! docker ps --format '{{.Names}}' | grep -Fxq -- "$container_name"; then
    log_error "${service_name}容器未运行"
    return 1
  fi

  # Containers without a HEALTHCHECK have no .State.Health; the template
  # guards for that explicitly, and a failing inspect falls back the same way.
  local health_status
  health_status=$(docker inspect \
    --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}no-healthcheck{{end}}' \
    "$container_name" 2>/dev/null) || health_status="no-healthcheck"

  case "$health_status" in
    "healthy")
      # Healthy again: forget earlier restart attempts.
      reset_restart_count "$container_name"
      return 0
      ;;
    "unhealthy")
      log_error "${service_name}容器健康检查失败"
      return 1
      ;;
    "starting")
      log_warning "${service_name}容器正在启动中..."
      return 2
      ;;
    "no-healthcheck")
      # No health check configured: fall back to the container run state.
      local container_status
      container_status=$(docker inspect --format='{{.State.Status}}' \
        "$container_name" 2>/dev/null) || container_status="unknown"
      if [ "$container_status" = "running" ]; then
        reset_restart_count "$container_name"
        return 0
      fi
      log_error "${service_name}容器状态异常: ${container_status}"
      return 1
      ;;
    *)
      log_warning "${service_name}容器健康状态未知: ${health_status}"
      return 2
      ;;
  esac
}

# Restart a container unless its attempt limit has been reached.
# Arguments: $1 - container name, $2 - display name used in log messages
# Returns:   0 if `docker restart` succeeded, 1 if it failed or was skipped.
restart_container() {
  local container_name=$1
  local service_name=$2

  if ! should_restart_container "$container_name"; then
    log_error "${service_name}容器已达到最大重启次数限制,跳过重启"
    return 1
  fi

  local restart_count
  restart_count=$(increment_restart_count "$container_name")
  log_warning "${service_name}容器开始重启 (第${restart_count}次尝试)"

  if docker restart "$container_name"; then
    log_info "${service_name}容器重启命令执行成功,等待启动..."
    sleep 30 # give the container time to come up
    return 0
  fi

  log_error "${service_name}容器重启失败"
  return 1
}

# Check one service and restart it when it is unhealthy.
# Arguments: $1 - container name, $2 - display name used in log messages
# Returns:   0 when healthy, still starting/unknown, or restarted successfully;
#            non-zero when the restart failed or was skipped.
monitor_service() {
  local container_name=$1
  local service_name=$2

  # Capture the status with `||` so a non-zero result never trips `set -e`,
  # whatever context this function is called from.
  local health_result=0
  check_container_health "$container_name" "$service_name" || health_result=$?

  case "$health_result" in
    0)
      # Healthy
      return 0
      ;;
    1)
      # Unhealthy: try a restart
      log_warning "${service_name}服务不健康,尝试重启..."
      restart_container "$container_name" "$service_name"
      return $?
      ;;
    2)
      # Starting or unknown: keep watching
      return 0
      ;;
  esac
}

# Raise an alert (extension point).
# Arguments: $1 - message, $2 - severity (accepted but currently unused)
send_alert() {
  local message=$1
  local severity=$2

  log_error "告警: $message"

  # Further channels can be added here, e.g.:
  # - e-mail
  # - Slack
  # - a monitoring system
  # - the system log

  # Example: write to the system log when `logger` is available.
  if command -v logger >/dev/null 2>&1; then
    logger -t "docker-monitor" -p user.error "$message"
  fi
}

# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------

# Run monitoring passes forever, sleeping MONITOR_INTERVAL between passes.
main_monitor_loop() {
  log_info "Docker服务监控开始,监控间隔: ${MONITOR_INTERVAL}秒"

  # Services to monitor, as "container_name:display name"
  local services=(
    "knowledge_base_mysql:MySQL数据库"
    "knowledge_base_redis:Redis缓存"
    "knowledge_base_meilisearch:Meilisearch搜索"
    "knowledge_base_app:Web应用"
    "knowledge_base_queue:队列处理器"
  )
  local total_services=${#services[@]}
  local service container_name service_name failed_services message

  while true; do
    failed_services=0

    log_info "开始监控检查 (共${total_services}个服务)"

    for service in "${services[@]}"; do
      container_name=${service%%:*}
      service_name=${service#*:}

      if ! monitor_service "$container_name" "$service_name"; then
        # Not ((failed_services++)): that expression evaluates to 0 on the
        # first failure, returns status 1 and would abort under `set -e`.
        failed_services=$((failed_services + 1))
      fi
    done

    if [ "$failed_services" -gt 0 ]; then
      message="监控检查完成,发现 ${failed_services}/${total_services} 个服务存在问题"
      log_warning "$message"

      # Strictly more than half, as the alert text states; multiplying avoids
      # the rounding-down of integer division.
      if (( failed_services * 2 > total_services )); then
        send_alert "超过一半的服务出现问题: $message" "critical"
      fi
    else
      log_success "所有服务运行正常"
    fi

    log_info "等待 ${MONITOR_INTERVAL} 秒后进行下次检查..."
    sleep "$MONITOR_INTERVAL"
  done
}

# Signal handler: log and exit cleanly.
cleanup() {
  log_info "监控脚本正在退出..."
  exit 0
}

# Print usage information.
show_help() {
  cat <<EOF
Docker服务监控脚本

用法: $0 [选项]

选项:
 -i, --interval SECONDS 监控间隔(默认: 60秒)
 -r, --max-restarts NUM 最大重启尝试次数(默认: 3次)
 -c, --cooldown SECONDS 重启冷却时间(默认: 300秒)
 -l, --log-file PATH 日志文件路径(默认: ./storage/logs/monitor.log)
 -h, --help 显示此帮助信息

环境变量:
 MONITOR_INTERVAL 监控间隔
 MAX_RESTART_ATTEMPTS 最大重启尝试次数
 RESTART_COOLDOWN 重启冷却时间
 LOG_FILE 日志文件路径
EOF
}

# Abort with a clear message when an option is missing its value.
# Arguments: the remaining command-line words, starting with the option itself
require_option_value() {
  if [[ $# -lt 2 ]]; then
    echo "选项 $1 缺少参数值" >&2
    show_help
    exit 1
  fi
}

# ---------------------------------------------------------------------------
# Command-line parsing
# ---------------------------------------------------------------------------
while [[ $# -gt 0 ]]; do
  case "$1" in
    -i|--interval)
      require_option_value "$@"
      MONITOR_INTERVAL="$2"
      shift 2
      ;;
    -r|--max-restarts)
      require_option_value "$@"
      MAX_RESTART_ATTEMPTS="$2"
      shift 2
      ;;
    -c|--cooldown)
      require_option_value "$@"
      RESTART_COOLDOWN="$2"
      shift 2
      ;;
    -l|--log-file)
      require_option_value "$@"
      LOG_FILE="$2"
      shift 2
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    *)
      echo "未知选项: $1"
      show_help
      exit 1
      ;;
  esac
done

# MAX_RESTART_ATTEMPTS feeds an integer comparison; reject anything else early.
if [[ ! "$MAX_RESTART_ATTEMPTS" =~ ^[0-9]+$ ]]; then
  echo "无效的最大重启尝试次数: $MAX_RESTART_ATTEMPTS" >&2
  exit 1
fi

# Create the directories only now, so a --log-file given on the command line
# gets its parent directory created as well.
mkdir -p "$(dirname "$LOG_FILE")"
mkdir -p "$RESTART_COUNTER_DIR"

# Signal handling
trap cleanup SIGINT SIGTERM

# Start monitoring only when executed directly, not when sourced.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  main_monitor_loop
fi