提交 ed81b83b 作者: 陈泽健

remove(AutomatedServiceMonitoring): 删除自动化服务监测脚本

- 移除 AutomatedServiceMonitoring.sh 脚本文件
- 清理相关配置和依赖安装功能
- 删除日志监控、内存监控、MySQL监控等功能模块
- 移除邮件通知和钉钉机器人集成功能
- 清理磁盘空间检测和容器信息收集功能
- 删除Markdown转HTML报告生成功能
上级 35387cfc
......@@ -231,3 +231,22 @@ CONFIG_TIMEOUT=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" CONFIG GE
CONFIG_SAVE=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" CONFIG GET save 2>&1 | grep -v "Warning" | tail -1 | tr -d '\r')
echo "CONFIG_CHECK:MaxClients:$CONFIG_MAXCLIENTS,Timeout:${CONFIG_TIMEOUT}s,Save:$CONFIG_SAVE"
# ========== Addition: cluster state detection ==========
CLUSTER_INFO=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" CLUSTER INFO 2>&1 | grep -v "Warning")
# Parse cluster_state first and gate on it. BUGFIX: when cluster support is
# disabled, CLUSTER INFO returns an error line ("ERR This instance has
# cluster support disabled"), which is non-empty — so testing only
# -n "$CLUSTER_INFO" entered the cluster branch and printed an empty State
# instead of reporting Standalone.
CLUSTER_STATE=$(echo "$CLUSTER_INFO" | grep -oP 'cluster_state:\K\w+' | head -1)
if [ -n "$CLUSTER_STATE" ]; then
CLUSTER_SLOTS_ASSIGNED=$(echo "$CLUSTER_INFO" | grep -oP 'cluster_slots_assigned:\K\d+' | head -1)
CLUSTER_SLOTS_OK=$(echo "$CLUSTER_INFO" | grep -oP 'cluster_slots_ok:\K\d+' | head -1)
CLUSTER_KNOWN_NODES=$(echo "$CLUSTER_INFO" | grep -oP 'cluster_known_nodes:\K\d+' | head -1)
if [ "$CLUSTER_STATE" = "ok" ]; then
echo "CLUSTER_STATUS:State:OK,Nodes:${CLUSTER_KNOWN_NODES:-1},Slots:${CLUSTER_SLOTS_ASSIGNED:-0}/${CLUSTER_SLOTS_OK:-0}"
else
echo "CLUSTER_STATUS:State:${CLUSTER_STATE},Nodes:${CLUSTER_KNOWN_NODES:-1}"
fi
else
# Not running in cluster mode (or CLUSTER INFO unavailable) — standalone.
echo "CLUSTER_STATUS:Standalone"
fi
This source diff could not be displayed because it is too large. You can view the blob instead.
# PowerShell helper script for patching check_server_health.ps1
# (repairs encoding and syntax problems in that script, then re-saves it).
$scriptPath = "check_server_health.ps1"
# Read the whole target file as a single string (UTF-8).
$content = Get-Content $scriptPath -Raw -Encoding UTF8
# Patch known-bad string fragments.
# NOTE(review): the replacement below is a no-op — the pattern and the
# replacement text are byte-identical; presumably one side was meant to
# differ (e.g. contain a real newline instead of the `n escape). TODO confirm.
$content = $content -replace '检测失败或无数据`n"', '检测失败或无数据`n"'
# Rewrite "$(...).Member"-style accesses to "${_}.Member".
# NOTE(review): in a single-quoted replacement string '${_}' is emitted
# literally (it is not a capture-group reference) — verify this is intended.
$content = $content -replace '\$([^)]+)\.Name', '${_}.Name'
$content = $content -replace '\$([^)]+)\.Value', '${_}.Value'
$content = $content -replace '\$([^)]+)\.Status', '${_}.Status'
$content = $content -replace '\$([^)]+)\.Threshold', '${_}.Threshold'
$content = $content -replace '\$([^)]+)\.Message', '${_}.Message'
# Fix specific variable-reference issues after the literal "容器" prefix;
# here '${1}' in the single-quoted replacement DOES refer to capture group 1.
$content = $content -replace '容器\$([^}]+)', '容器${1}'
$content = $content -replace '容器\$([a-zA-Z_]+)', '容器${1}'
# Save the result as UTF-8 with BOM so PowerShell 5.x parses it correctly.
$utf8 = New-Object System.Text.UTF8Encoding $true
[System.IO.File]::WriteAllText((Resolve-Path $scriptPath).Path, $content, $utf8)
Write-Host "脚本修复完成!" -ForegroundColor Green
Write-Host "文件已使用UTF-8 with BOM编码保存" -ForegroundColor Green
# -*- coding: utf-8 -*-
"""Fix newline handling inside the PowerShell here-string of check_server_health.ps1.

The report text inside the $reportContent here-string contained the literal
escape sequence `n; this script rewrites that one known-bad fragment to use a
real newline character, preserving the UTF-8 BOM when re-saving the file.
"""
file_path = r'E:\GithubData\ubains-module-test\AuxiliaryTool\ScriptTool\新服务自检\check_server_health.ps1'

with open(file_path, 'r', encoding='utf-8-sig') as f:
    content = f.read()

# Locate the boundaries of the here-string: the opening marker and the LAST
# closing "@ in the file.
here_string_start = content.find('$reportContent = @"')
here_string_end = content.rfind('"@')

# Guard against a malformed file: both markers must exist AND the closing
# marker must come after the opening one. (Previously only `!= -1` was
# checked; an "@ occurring before the opening marker would have produced
# inverted slices and silently corrupted the file.)
if here_string_start != -1 and here_string_end > here_string_start:
    before = content[:here_string_start]
    here_string = content[here_string_start:here_string_end]
    after = content[here_string_end:]
    # Replace the literal `n escape with a true newline character.
    old_text = '"- 检测失败或无数据`n"'
    new_text = '"- 检测失败或无数据\n"'
    here_string = here_string.replace(old_text, new_text)
    content = before + here_string + after
    with open(file_path, 'w', encoding='utf-8-sig') as f:
        f.write(content)
    print('Here-String换行符修复完成')
else:
    print('未找到Here-String边界')
#!/bin/bash
# MySQL deep health-check script — probes the "umysql" docker container and
# prints one KEY:VALUE line per metric for a downstream collector to parse.
# Usage: ./mysql_depth_check.sh
# NOTE(review): the first few checks pass the password on the mysql command
# line (-p"..."), which exposes it in `ps` output; later checks use the
# MYSQL_PWD environment variable — consider unifying on MYSQL_PWD.
MYSQL_PASSWORD="dNrprU&2S"
CONTAINER="umysql"
# Server uptime, reported in whole days.
UPTIME=$(docker exec $CONTAINER mysql -uroot -p"$MYSQL_PASSWORD" -e "SHOW STATUS LIKE 'Uptime';" 2>&1 | tail -1 | awk '{print $2}')
if [ -n "$UPTIME" ] && [ "$UPTIME" -gt 0 ]; then
UPTIME_DAYS=$((UPTIME / 86400))
echo "UPTIME_DAYS:$UPTIME_DAYS"
else
echo "UPTIME_DAYS:N/A"
fi
# Connection usage: current client threads vs. the configured maximum, plus
# the usage percentage (one decimal place).
THREADS_CONNECTED=$(docker exec $CONTAINER mysql -uroot -p"$MYSQL_PASSWORD" -e "SHOW STATUS LIKE 'Threads_connected';" 2>&1 | tail -1 | awk '{print $2}')
MAX_CONNECTIONS=$(docker exec $CONTAINER mysql -uroot -p"$MYSQL_PASSWORD" -e "SHOW VARIABLES LIKE 'max_connections';" 2>&1 | tail -1 | awk '{print $2}')
if [ -n "$THREADS_CONNECTED" ] && [ -n "$MAX_CONNECTIONS" ] && [ "$MAX_CONNECTIONS" -gt 0 ]; then
CONN_PERCENT=$(awk "BEGIN {printf \"%.1f\", $THREADS_CONNECTED*100/$MAX_CONNECTIONS}")
echo "CONNECTIONS:$THREADS_CONNECTED/$MAX_CONNECTIONS/$CONN_PERCENT"
else
echo "CONNECTIONS:N/A"
fi
# Slow-query counter (cumulative since server start).
SLOW_QUERIES=$(docker exec $CONTAINER mysql -uroot -p"$MYSQL_PASSWORD" -e "SHOW GLOBAL STATUS LIKE 'Slow_queries';" 2>&1 | tail -1 | awk '{print $2}')
if [ -n "$SLOW_QUERIES" ]; then
echo "SLOW_QUERIES:$SLOW_QUERIES"
else
echo "SLOW_QUERIES:0"
fi
# QPS: average queries per second over the whole uptime (Questions / Uptime).
QUESTIONS=$(docker exec $CONTAINER mysql -uroot -p"$MYSQL_PASSWORD" -e "SHOW GLOBAL STATUS LIKE 'Questions';" 2>&1 | tail -1 | awk '{print $2}')
UPTIME=$(docker exec $CONTAINER mysql -uroot -p"$MYSQL_PASSWORD" -e "SHOW STATUS LIKE 'Uptime';" 2>&1 | tail -1 | awk '{print $2}')
if [ -n "$QUESTIONS" ] && [ -n "$UPTIME" ] && [ "$UPTIME" -gt 0 ]; then
QPS=$(awk "BEGIN {printf \"%.2f\", $QUESTIONS/$UPTIME}")
echo "QPS:$QPS"
else
echo "QPS:N/A"
fi
# TPS: average (commits + rollbacks) per second over the whole uptime.
COM_COMMIT=$(docker exec $CONTAINER mysql -uroot -p"$MYSQL_PASSWORD" -e "SHOW GLOBAL STATUS LIKE 'Com_commit';" 2>&1 | tail -1 | awk '{print $2}')
COM_ROLLBACK=$(docker exec $CONTAINER mysql -uroot -p"$MYSQL_PASSWORD" -e "SHOW GLOBAL STATUS LIKE 'Com_rollback';" 2>&1 | tail -1 | awk '{print $2}')
if [ -n "$COM_COMMIT" ] && [ -n "$COM_ROLLBACK" ] && [ -n "$UPTIME" ] && [ "$UPTIME" -gt 0 ]; then
TOTAL_TRANS=$((COM_COMMIT + COM_ROLLBACK))
TPS=$(awk "BEGIN {printf \"%.2f\", $TOTAL_TRANS/$UPTIME}")
echo "TPS:$TPS"
else
echo "TPS:N/A"
fi
# Deadlock detection.
# NOTE(review): 'Innodb_deadlocks' is a MariaDB/Percona status variable, not
# stock MySQL — on stock MySQL this query returns nothing and 0 is reported.
# Also, the second half of the condition below duplicates the first
# (-n "$X" already implies "$X" != "").
DEADLOCK_OUTPUT=$(docker exec $CONTAINER mysql -uroot -p"$MYSQL_PASSWORD" -e "SHOW GLOBAL STATUS LIKE 'Innodb_deadlocks';" 2>&1 | grep -v Warning)
if [ -n "$DEADLOCK_OUTPUT" ] && [ "$DEADLOCK_OUTPUT" != "" ]; then
DEADLOCKS=$(echo "$DEADLOCK_OUTPUT" | awk 'NF>=2 {print $2}')
if [ -z "$DEADLOCKS" ]; then
DEADLOCKS=0
fi
else
DEADLOCKS=0
fi
echo "DEADLOCKS:$DEADLOCKS"
# InnoDB buffer pool hit rate: 1 - (disk reads / read requests), as a percent.
BUFFER_POOL_RESULT=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -e "SELECT (1 - (SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Innodb_buffer_pool_reads') / (SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Innodb_buffer_pool_read_requests')) * 100 AS hit_ratio;" 2>&1 | grep -v Warning | tail -1)
if [ -n "$BUFFER_POOL_RESULT" ] && [[ "$BUFFER_POOL_RESULT" =~ ^[0-9]+\.?[0-9]*$ ]]; then
HIT_RATE=$(awk "BEGIN {printf \"%.2f\", $BUFFER_POOL_RESULT}")
echo "BUFFER_POOL_HIT_RATE:$HIT_RATE"
else
echo "BUFFER_POOL_HIT_RATE:N/A"
fi
# Table-cache hit rate.
OPEN_TABLES=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW GLOBAL STATUS LIKE 'Open_tables';" 2>/dev/null | awk '{print $2}')
OPENED_TABLES=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW GLOBAL STATUS LIKE 'Opened_tables';" 2>/dev/null | awk '{print $2}')
# Validate that both values are plain integers before doing arithmetic.
if [[ "$OPEN_TABLES" =~ ^[0-9]+$ ]] && [[ "$OPENED_TABLES" =~ ^[0-9]+$ ]]; then
if [ "$OPENED_TABLES" -eq 0 ]; then
# No table was ever re-opened: 100% cache hit.
echo "CACHE_HIT_RATE:100.00"
elif [ "$OPEN_TABLES" -gt 0 ]; then
# Table cache hit rate = Open_tables / (Open_tables + Opened_tables) * 100.
# Opened_tables counts the times a table had to be re-opened (cache misses).
TOTAL_OPENS=$((OPEN_TABLES + OPENED_TABLES))
CACHE_HIT_RATE=$(awk "BEGIN {printf \"%.2f\", ($OPEN_TABLES * 100) / $TOTAL_OPENS}")
echo "CACHE_HIT_RATE:$CACHE_HIT_RATE"
else
echo "CACHE_HIT_RATE:N/A"
fi
else
echo "CACHE_HIT_RATE:N/A"
fi
# Number of currently active (non-Sleep, non-binlog) sessions.
ACTIVE_QUERIES=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW PROCESSLIST;" 2>/dev/null | grep -v "Sleep" | grep -v "binlog" | wc -l)
echo "ACTIVE_QUERIES:$ACTIVE_QUERIES"
# Binary-log status; when enabled, also report how many binlog files exist.
BINLOG_STATUS=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW VARIABLES LIKE 'log_bin';" 2>/dev/null | awk '{print $2}')
if [ "$BINLOG_STATUS" = "ON" ]; then
BINLOG_COUNT=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW BINARY LOGS;" 2>/dev/null | wc -l)
echo "BINLOG_STATUS:ON/$BINLOG_COUNT"
else
echo "BINLOG_STATUS:OFF"
fi
# Table fragmentation: number of user tables with DATA_FREE > 10MB.
FRAGMENTED_TABLES=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SELECT COUNT(*) FROM information_schema.TABLES WHERE TABLE_SCHEMA NOT IN ('information_schema','performance_schema','mysql','sys') AND DATA_FREE > 10485760;" 2>/dev/null)
if [ -n "$FRAGMENTED_TABLES" ] && [[ "$FRAGMENTED_TABLES" =~ ^[0-9]+$ ]]; then
echo "FRAGMENTED_TABLES:$FRAGMENTED_TABLES"
else
echo "FRAGMENTED_TABLES:0"
fi
# Failed-connection counter (Aborted_connects).
CONN_ERRORS=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW STATUS LIKE 'Aborted_connects';" 2>/dev/null | awk '{print $2}')
if [ -n "$CONN_ERRORS" ]; then
echo "CONN_ERRORS:$CONN_ERRORS"
else
echo "CONN_ERRORS:0"
fi
# Configured InnoDB buffer pool size, reported in MB.
INNODB_BP_SIZE=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW VARIABLES LIKE 'innodb_buffer_pool_size';" 2>/dev/null | awk '{print $2}')
if [ -n "$INNODB_BP_SIZE" ] && [ "$INNODB_BP_SIZE" -gt 0 ]; then
# Convert bytes to MB.
BP_SIZE_MB=$((INNODB_BP_SIZE / 1024 / 1024))
echo "INNODB_BP_SIZE:${BP_SIZE_MB}MB"
else
echo "INNODB_BP_SIZE:N/A"
fi
# Active-transaction counter.
# NOTE(review): 'Innodb_trx_active' is not a standard MySQL status variable;
# on stock MySQL this returns nothing and 0 is reported — TODO confirm target
# server flavor.
TRX_ACTIVE=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW STATUS LIKE 'Innodb_trx_active';" 2>/dev/null | awk '{print $2}')
if [ -n "$TRX_ACTIVE" ]; then
echo "TRX_ACTIVE:$TRX_ACTIVE"
else
echo "TRX_ACTIVE:0"
fi
# Row-lock waits currently in progress.
LOCK_WAITS=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW STATUS LIKE 'Innodb_row_lock_current_waits';" 2>/dev/null | awk '{print $2}')
if [ -n "$LOCK_WAITS" ]; then
echo "LOCK_WAITS:$LOCK_WAITS"
else
echo "LOCK_WAITS:0"
fi
# Connection-pool snapshot: running / connected / idle thread counts.
THREADS_RUNNING=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW STATUS LIKE 'Threads_running';" 2>/dev/null | awk '{print $2}')
THREADS_CONNECTED_NEW=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW STATUS LIKE 'Threads_connected';" 2>/dev/null | awk '{print $2}')
if [ -n "$THREADS_RUNNING" ] && [ -n "$THREADS_CONNECTED_NEW" ] && [ "$THREADS_CONNECTED_NEW" -gt 0 ]; then
IDLE_CONN=$((THREADS_CONNECTED_NEW - THREADS_RUNNING))
echo "THREADS_POOL:$THREADS_RUNNING/$THREADS_CONNECTED_NEW/$IDLE_CONN"
else
echo "THREADS_POOL:N/A"
fi
# InnoDB engine-status digest: counts how many of the two section headers
# (deadlock / transactions) appear in SHOW ENGINE INNODB STATUS output.
INNODB_STATUS=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW ENGINE INNODB STATUS\G" 2>/dev/null | grep -E "LATEST DETECTED DEADLOCK|TRANSACTIONS" | wc -l)
echo "INNODB_STATUS:$INNODB_STATUS"
# On-disk temporary-table rate: disk tmp tables / all tmp tables * 100.
TEMP_TABLES=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW STATUS LIKE 'Created_tmp_disk_tables';" 2>/dev/null | awk '{print $2}')
TEMP_TABLES_TOTAL=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW STATUS LIKE 'Created_tmp_tables';" 2>/dev/null | awk '{print $2}')
if [ -n "$TEMP_TABLES" ] && [ -n "$TEMP_TABLES_TOTAL" ] && [ "$TEMP_TABLES_TOTAL" -gt 0 ]; then
TEMP_RATE=$(awk "BEGIN {printf \"%.2f\", ($TEMP_TABLES * 100) / $TEMP_TABLES_TOTAL}")
echo "TEMP_TABLE_RATE:$TEMP_RATE"
else
echo "TEMP_TABLE_RATE:N/A"
fi
# Total size of all user databases (data + indexes), in MB.
DB_SIZE=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SELECT ROUND(SUM(data_length + index_length) / 1024 / 1024, 2) FROM information_schema.TABLES WHERE TABLE_SCHEMA NOT IN ('information_schema','performance_schema','mysql','sys');" 2>/dev/null)
if [ -n "$DB_SIZE" ]; then
echo "DATABASE_SIZE:${DB_SIZE}MB"
else
echo "DATABASE_SIZE:N/A"
fi
# Number of user base tables.
TABLE_COUNT=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SELECT COUNT(*) FROM information_schema.TABLES WHERE TABLE_SCHEMA NOT IN ('information_schema','performance_schema','mysql','sys') AND TABLE_TYPE='BASE TABLE';" 2>/dev/null)
if [ -n "$TABLE_COUNT" ]; then
echo "TABLE_COUNT:$TABLE_COUNT"
else
echo "TABLE_COUNT:0"
fi
# Replication role: SLAVE when the IO thread is running, otherwise MASTER.
SLAVE_STATUS=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW SLAVE STATUS\G" 2>/dev/null | grep "Slave_IO_Running:" | awk '{print $2}')
if [ "$SLAVE_STATUS" = "Yes" ]; then
echo "REPLICATION_STATUS:SLAVE"
else
echo "REPLICATION_STATUS:MASTER"
fi
# Binlog retention period in days (expire_logs_days; deprecated in MySQL 8).
BINLOG_EXPIRE=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "SHOW VARIABLES LIKE 'expire_logs_days';" 2>/dev/null | awk '{print $2}')
if [ -n "$BINLOG_EXPIRE" ]; then
echo "BINLOG_EXPIRE:${BINLOG_EXPIRE}days"
else
echo "BINLOG_EXPIRE:N/A"
fi
# ========== High-priority additions ==========
# Active-query detail (SHOW PROCESSLIST) — samples at most 10 rows.
PROCESSLIST_OUTPUT=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -e "SHOW PROCESSLIST;" 2>/dev/null | grep -v "Command" | head -10)
if [ -n "$PROCESSLIST_OUTPUT" ]; then
# Per-state session counts. BUGFIX: `grep -c` already prints 0 when nothing
# matches (it only EXITS non-zero), so the previous `|| echo "0"` fallback
# produced a two-line value ("0\n0"); the fallback has been removed.
SLEEP_COUNT=$(echo "$PROCESSLIST_OUTPUT" | grep -c "Sleep")
QUERY_COUNT=$(echo "$PROCESSLIST_OUTPUT" | grep -v "Sleep" | wc -l)
# Column 6 of the tab-separated PROCESSLIST output is TIME (seconds);
# sessions running longer than 5s count as long-running.
LONG_QUERY_COUNT=$(echo "$PROCESSLIST_OUTPUT" | awk -F'\t' '{if ($6>5) print}' | wc -l)
echo "ACTIVE_PROCESSLIST:Sleep:${SLEEP_COUNT},Active:${QUERY_COUNT},LongRunning:${LONG_QUERY_COUNT}"
# Top-5 longest-running non-Sleep sessions, joined into one line with '|'.
LONG_QUERIES=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -e "SELECT ID, USER, HOST, DB, COMMAND, TIME, STATE, LEFT(INFO, 50) AS QUERY FROM information_schema.PROCESSLIST WHERE COMMAND != 'Sleep' AND TIME > 0 ORDER BY TIME DESC LIMIT 5;" 2>/dev/null | grep -v "QUERY")
if [ -n "$LONG_QUERIES" ]; then
echo "LONG_QUERIES_TOP5:$(echo "$LONG_QUERIES" | head -5 | tr '\n' '|' | sed 's/|$//')"
else
echo "LONG_QUERIES_TOP5:N/A"
fi
else
echo "ACTIVE_PROCESSLIST:N/A"
echo "LONG_QUERIES_TOP5:N/A"
fi
# Costly-statement digest stats from performance_schema (timer values are in
# picoseconds; dividing by 1e9 yields milliseconds-scale figures).
SLOW_QUERY_STATS=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -e "
SELECT
COUNT_STAR as exec_count,
ROUND(SUM_TIMER_WAIT/1000000000, 2) as total_time_sec,
ROUND(AVG_TIMER_WAIT/1000000000, 2) as avg_time_sec,
ROUND(SUM_LOCK_TIME/1000000000, 2) as lock_time_sec,
digest_text as query_sample
FROM performance_schema.events_statements_summary_by_digest
WHERE digest_text IS NOT NULL
AND digest_text NOT LIKE '%performance_schema%'
AND COUNT_STAR > 10
AND SUM_TIMER_WAIT > 1000000000
ORDER BY SUM_TIMER_WAIT DESC
LIMIT 10;
" 2>/dev/null | grep -v "query_sample")
if [ -n "$SLOW_QUERY_STATS" ]; then
# Report the single most expensive digest (rows are sorted by total wait).
TOP_QUERY=$(echo "$SLOW_QUERY_STATS" | head -2 | tail -1)
if [ -n "$TOP_QUERY" ]; then
EXEC_COUNT=$(echo "$TOP_QUERY" | awk '{print $1}')
TOTAL_TIME=$(echo "$TOP_QUERY" | awk '{print $2}')
AVG_TIME=$(echo "$TOP_QUERY" | awk '{print $3}')
LOCK_TIME=$(echo "$TOP_QUERY" | awk '{print $4}')
echo "SLOW_QUERY_TOP1:Exec:${EXEC_COUNT},TotalTime:${TOTAL_TIME}s,AvgTime:${AVG_TIME}s,LockTime:${LOCK_TIME}s"
else
echo "SLOW_QUERY_TOP1:N/A"
fi
else
echo "SLOW_QUERY_TOP1:N/A"
fi
# Tables with >1000 rows that have no index at all (nothing in STATISTICS).
MISSING_INDEX_TABLES=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -e "
SELECT
t.table_schema,
t.table_name,
t.table_rows,
ROUND(t.data_length / 1024 / 1024, 2) as data_mb
FROM information_schema.TABLES t
WHERE t.table_schema NOT IN ('information_schema','performance_schema','mysql','sys')
AND NOT EXISTS (
SELECT 1 FROM information_schema.STATISTICS s
WHERE s.table_schema = t.table_schema
AND s.table_name = t.table_name
)
AND t.table_rows > 1000
ORDER BY t.data_length DESC
LIMIT 5;
" 2>/dev/null | grep -v "data_mb")
if [ -n "$MISSING_INDEX_TABLES" ]; then
MISSING_COUNT=$(echo "$MISSING_INDEX_TABLES" | wc -l)
echo "TABLES_WITHOUT_INDEX:$MISSING_COUNT"
else
echo "TABLES_WITHOUT_INDEX:0"
fi
# ========== Medium-priority additions ==========
# Per-database size listing (top 10 user databases by size).
DATABASE_LIST=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "
SELECT
table_schema,
ROUND(sum(data_length + index_length) / 1024 / 1024, 2) as size_mb
FROM information_schema.tables
WHERE table_schema NOT IN ('information_schema','performance_schema','mysql','sys')
GROUP BY table_schema
ORDER BY size_mb DESC
LIMIT 10;
" 2>&1 | grep -v "\[Warning\]")
if [ -n "$DATABASE_LIST" ]; then
DB_COUNT=$(echo "$DATABASE_LIST" | wc -l)
TOTAL_SIZE=$(echo "$DATABASE_LIST" | awk -F'\t' '{sum+=$2} END {printf "%.2f", sum}')
DB_NAMES=$(echo "$DATABASE_LIST" | awk -F'\t' '{print $1}' | tr '\n' ',' | sed 's/,$//')
echo "DATABASE_LIST:Count:${DB_COUNT},TotalSize:${TOTAL_SIZE}MB,Databases:${DB_NAMES}"
else
echo "DATABASE_LIST:N/A"
fi
# InnoDB transaction detail: number of open transactions and their combined
# age in seconds.
INNODB_TRX_INFO=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "
SELECT
COUNT(*) as trx_count,
IFNULL(SUM(TIMESTAMPDIFF(SECOND, trx_started, NOW())), 0) as total_trx_time
FROM information_schema.INNODB_TRX;
" 2>/dev/null)
if [ -n "$INNODB_TRX_INFO" ]; then
TRX_COUNT=$(echo "$INNODB_TRX_INFO" | awk -F'\t' '{print $1}')
TOTAL_TRX_TIME=$(echo "$INNODB_TRX_INFO" | awk -F'\t' '{print $2}')
echo "INNODB_TRX_DETAIL:Count:${TRX_COUNT},TotalTime:${TOTAL_TRX_TIME}s"
else
echo "INNODB_TRX_DETAIL:Count:0,TotalTime:0s"
fi
# Top-20 largest tables in the 'ubains' schema (data + index size).
UBAINS_TOP_TABLES=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -e "
SELECT
table_name,
table_rows,
round(data_length / 1024 / 1024, 2) as data_mb,
round(index_length / 1024 / 1024, 2) as index_mb,
round((data_length + index_length) / 1024 / 1024, 2) as total_mb
FROM information_schema.TABLES
WHERE TABLE_SCHEMA = 'ubains'
ORDER BY (data_length + index_length) DESC
LIMIT 20;
" 2>/dev/null | grep -v "total_mb")
if [ -n "$UBAINS_TOP_TABLES" ]; then
TABLE_COUNT=$(echo "$UBAINS_TOP_TABLES" | wc -l)
# Extract the largest table (first data row after the stripped header).
TOP1_TABLE=$(echo "$UBAINS_TOP_TABLES" | head -2 | tail -1)
if [ -n "$TOP1_TABLE" ]; then
TOP1_NAME=$(echo "$TOP1_TABLE" | awk '{print $1}')
TOP1_SIZE=$(echo "$TOP1_TABLE" | awk '{print $5}')
echo "UBAINS_TABLES_TOP20:Count:${TABLE_COUNT},Top1:${TOP1_NAME}:${TOP1_SIZE}MB"
else
echo "UBAINS_TABLES_TOP20:Count:${TABLE_COUNT}"
fi
else
echo "UBAINS_TABLES_TOP20:N/A"
fi
# Replication detail: IO/SQL thread state and seconds behind master.
SLAVE_STATUS_DETAIL=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -e "SHOW SLAVE STATUS\G" 2>/dev/null)
if [ -n "$SLAVE_STATUS_DETAIL" ]; then
# Pull the key replication fields out of the \G-formatted output.
SLAVE_IO=$(echo "$SLAVE_STATUS_DETAIL" | grep "Slave_IO_Running:" | awk '{print $2}')
SLAVE_SQL=$(echo "$SLAVE_STATUS_DETAIL" | grep "Slave_SQL_Running:" | awk '{print $2}')
BEHIND_MASTER=$(echo "$SLAVE_STATUS_DETAIL" | grep "Seconds_Behind_Master:" | awk '{print $2}')
if [ "$SLAVE_IO" = "Yes" ]; then
echo "REPLICATION_DETAIL:IO:$SLAVE_IO,SQL:$SLAVE_SQL,Delay:${BEHIND_MASTER}s"
else
echo "REPLICATION_DETAIL:MASTER"
fi
else
echo "REPLICATION_DETAIL:MASTER"
fi
# Detailed connection-error stats removed — the table they relied on does not
# exist in the deployed MySQL version; the basic Aborted_connects counter
# (CONN_ERRORS above) covers this.
# ========== Addition: InnoDB buffer pool detail ==========
INNODB_BP_DETAIL=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -e "SHOW ENGINE INNODB STATUS\G" 2>/dev/null | grep -A 20 "Buffer pool hit rate")
if [ -n "$INNODB_BP_DETAIL" ]; then
BP_HIT_RATE=$(echo "$INNODB_BP_DETAIL" | grep -oP 'hit rate \K[\d/]+' | head -1)
if [ -n "$BP_HIT_RATE" ]; then
echo "INNODB_BP_DETAIL:HitRate:${BP_HIT_RATE}"
else
echo "INNODB_BP_DETAIL:N/A"
fi
else
echo "INNODB_BP_DETAIL:N/A"
fi
# ========== Addition: lock-wait detail ==========
# NOTE(review): INNODB_LOCK_WAITS lives in information_schema on MySQL 5.x
# but moved to performance_schema.data_lock_waits in 8.0 — verify version.
LOCK_INFO=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "
SELECT
COUNT(*) as lock_waits,
IFNULL(SUM(TIMESTAMPDIFF(SECOND, r.trx_started, NOW())), 0) as total_wait_time
FROM information_schema.INNODB_LOCK_WAITS w
JOIN information_schema.INNODB_TRX r ON r.trx_id = w.requesting_trx_id;
" 2>/dev/null)
if [ -n "$LOCK_INFO" ]; then
LOCK_COUNT=$(echo "$LOCK_INFO" | awk -F'\t' '{print $1}')
WAIT_TIME=$(echo "$LOCK_INFO" | awk -F'\t' '{print $2}')
echo "LOCK_DETAIL:Waits:${LOCK_COUNT},TotalWait:${WAIT_TIME}s"
else
echo "LOCK_DETAIL:Waits:0,TotalWait:0s"
fi
# ========== Addition: table-fragmentation detail ==========
# Top-5 user tables with more than 10MB of free (fragmented) space.
FRAGMENTED_TABLES_DETAIL=$(docker exec -e MYSQL_PWD="$MYSQL_PASSWORD" $CONTAINER mysql -uroot -N -e "
SELECT
CONCAT(table_schema, '.', table_name) as table_name,
ROUND(data_length / 1024 / 1024, 2) as data_mb,
ROUND(data_free / 1024 / 1024, 2) as fragment_mb
FROM information_schema.TABLES
WHERE TABLE_SCHEMA NOT IN ('information_schema','performance_schema','mysql','sys')
AND DATA_FREE > 10485760
ORDER BY data_free DESC
LIMIT 5;
" 2>/dev/null | grep -v "fragment_mb")
if [ -n "$FRAGMENTED_TABLES_DETAIL" ]; then
FRAG_COUNT=$(echo "$FRAGMENTED_TABLES_DETAIL" | wc -l)
TOP_FRAG=$(echo "$FRAGMENTED_TABLES_DETAIL" | head -1)
if [ -n "$TOP_FRAG" ]; then
TABLE_NAME=$(echo "$TOP_FRAG" | awk '{print $1}')
FRAG_MB=$(echo "$TOP_FRAG" | awk '{print $3}')
echo "FRAGMENTED_DETAIL:Count:${FRAG_COUNT},Top1:${TABLE_NAME}:${FRAG_MB}MB"
else
echo "FRAGMENTED_DETAIL:Count:${FRAG_COUNT}"
fi
else
echo "FRAGMENTED_DETAIL:Count:0"
fi
#!/bin/bash
# Redis deep health-check script — probes the "uredis" docker container and
# prints one KEY:VALUE line per metric for a downstream collector to parse.
# Usage: ./redis_depth_check.sh
# NOTE(review): passing the password via `redis-cli -a` exposes it in `ps`
# output and triggers the warning lines the greps below filter out; consider
# REDISCLI_AUTH instead.
REDIS_PASSWORD="dNrprU&2S"
CONTAINER="uredis"
# Server version.
REDIS_VERSION=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" INFO server 2>&1 | grep redis_version | cut -d: -f2 | tr -d '\r')
if [ -n "$REDIS_VERSION" ]; then
echo "REDIS_VERSION:$REDIS_VERSION"
else
echo "REDIS_VERSION:N/A"
fi
# Uptime in days.
UPTIME_DAYS=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" INFO server 2>&1 | grep uptime_in_days | cut -d: -f2 | tr -d '\r')
if [ -n "$UPTIME_DAYS" ]; then
echo "UPTIME_DAYS:$UPTIME_DAYS"
else
echo "UPTIME_DAYS:N/A"
fi
# Key count in the current database (DBSIZE).
KEY_COUNT=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" DBSIZE 2>&1 | grep -v "Warning")
if [ -n "$KEY_COUNT" ] && [[ "$KEY_COUNT" =~ ^[0-9]+$ ]]; then
echo "KEY_COUNT:$KEY_COUNT"
else
echo "KEY_COUNT:0"
fi
# Memory usage: human-readable used memory and fragmentation ratio.
MEMORY_INFO=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" INFO memory 2>&1 | grep -E "used_memory_human:|mem_fragmentation_ratio:" | tr -d '\r' | tr '\n' '|' | sed 's/|$//')
if [ -n "$MEMORY_INFO" ]; then
echo "MEMORY_INFO:$MEMORY_INFO"
else
echo "MEMORY_INFO:N/A"
fi
# Connected-client count.
CLIENT_COUNT=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" INFO clients 2>&1 | grep connected_clients | cut -d: -f2 | tr -d '\r')
if [ -n "$CLIENT_COUNT" ]; then
echo "CLIENT_COUNT:$CLIENT_COUNT"
else
echo "CLIENT_COUNT:0"
fi
# ========== High-priority additions ==========
# Keyspace info: sum keys and expiring keys across all "dbN:" lines.
KEYSPACE_INFO=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" INFO keyspace 2>&1 | grep -v "Warning" | grep "^db" | tr -d '\r')
if [ -n "$KEYSPACE_INFO" ]; then
TOTAL_KEYS=0
TOTAL_EXPIRES=0
while IFS=: read -r db_key info; do
if [[ "$info" =~ keys=([0-9]+) ]]; then
keys=${BASH_REMATCH[1]}
TOTAL_KEYS=$((TOTAL_KEYS + keys))
fi
if [[ "$info" =~ expires=([0-9]+) ]]; then
expires=${BASH_REMATCH[1]}
TOTAL_EXPIRES=$((TOTAL_EXPIRES + expires))
fi
done <<< "$KEYSPACE_INFO"
echo "KEYSPACE_DETAIL:Total:$TOTAL_KEYS,Expires:$TOTAL_EXPIRES"
else
echo "KEYSPACE_DETAIL:N/A"
fi
# Key-type distribution sampled from up to 100 keys.
# NOTE(review): this runs one `docker exec`/TYPE call per sampled key, which
# is slow; also keys containing spaces would break the xargs invocation.
KEY_TYPE_SAMPLE=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" --scan --count 100 2>&1 | head -100 | xargs -I {} docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" TYPE {} 2>&1 | grep -v "Warning" | sort | uniq -c | awk '{print $2":"$1}' | tr '\n' '|' | sed 's/|$//' | sed 's/ /:/g')
if [ -n "$KEY_TYPE_SAMPLE" ]; then
echo "KEY_TYPE_DISTRIBUTION:$KEY_TYPE_SAMPLE"
else
echo "KEY_TYPE_DISTRIBUTION:N/A"
fi
# ========== Medium-priority additions ==========
# Persistence status: RDB copy-on-write activity and whether AOF is enabled.
PERSISTENCE_INFO=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" INFO persistence 2>&1 | grep -v "Warning" | grep -E "rdb_last_cow_size:|aof_enabled:|rdb_last_save_time:" | tr -d '\r' | tr '\n' '|' | sed 's/|$//')
if [ -n "$PERSISTENCE_INFO" ]; then
# Derive coarse RDB/AOF state flags from the raw fields.
RDB_COW=$(echo "$PERSISTENCE_INFO" | grep -oP 'rdb_last_cow_size:\K\d+' | head -1)
AOF_ENABLED=$(echo "$PERSISTENCE_INFO" | grep -oP 'aof_enabled:\K\d+' | head -1)
RDB_STATUS="空闲"
# The 2>/dev/null silences the test's error when RDB_COW is empty/non-numeric.
if [ "$RDB_COW" -gt 0 ] 2>/dev/null; then
RDB_STATUS="备份中"
fi
AOF_STATUS="未启用"
if [ "$AOF_ENABLED" = "1" ]; then
AOF_STATUS="已启用"
fi
echo "PERSISTENCE_STATUS:RDB:$RDB_STATUS,AOF:$AOF_STATUS"
else
echo "PERSISTENCE_STATUS:N/A"
fi
# Replication info: role, slave count, master link state.
REPL_INFO=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" INFO replication 2>&1 | grep -v "Warning" | grep -E "role:|connected_slaves:|master_link_status:" | tr -d '\r' | tr '\n' '|' | sed 's/|$//')
if [ -n "$REPL_INFO" ]; then
ROLE=$(echo "$REPL_INFO" | grep -oP 'role:\K\w+' | head -1)
CONNECTED_SLAVES=$(echo "$REPL_INFO" | grep -oP 'connected_slaves:\K\d+' | head -1)
MASTER_LINK=$(echo "$REPL_INFO" | grep -oP 'master_link_status:\K\w+' | head -1)
if [ "$ROLE" = "master" ]; then
echo "REPLICATION_STATUS:Role:$ROLE,Slaves:${CONNECTED_SLAVES:-0}"
elif [ "$ROLE" = "slave" ]; then
echo "REPLICATION_STATUS:Role:$ROLE,MasterLink:${MASTER_LINK:-unknown}"
else
echo "REPLICATION_STATUS:Role:$ROLE"
fi
else
echo "REPLICATION_STATUS:N/A"
fi
# Slow-log summary: entry count and slowest duration among the last 10.
SLOW_LOG=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" SLOWLOG GET 10 2>&1 | grep -v "Warning" | head -40)
if [ -n "$SLOW_LOG" ]; then
# Assumes each SLOWLOG record spans exactly 4 output lines
# (id, timestamp, duration, command) with the duration on line 3 of each
# group. NOTE(review): redis-cli record layout varies with command-argument
# count and server version, so this modulo-4 parsing is fragile — verify
# against the deployed redis-cli output.
SLOW_COUNT=0
SLOWEST_TIME=0
line_num=0
while IFS= read -r line; do
line_num=$((line_num + 1))
# Every 3rd line of a 4-line group is the duration (microseconds).
if [ $((line_num % 4)) -eq 3 ]; then
if [[ "$line" =~ ^[0-9]+$ ]]; then
duration=$line
if [ $duration -gt $SLOWEST_TIME ]; then
SLOWEST_TIME=$duration
fi
SLOW_COUNT=$((SLOW_COUNT + 1))
fi
fi
done <<< "$SLOW_LOG"
if [ $SLOW_COUNT -gt 0 ]; then
echo "SLOW_LOG_TOP10:Count:$SLOW_COUNT,Slowest:${SLOWEST_TIME}us"
else
echo "SLOW_LOG_TOP10:Count:0"
fi
else
echo "SLOW_LOG_TOP10:Count:0"
fi
# Command statistics: top 5 commands by call count.
CMD_STATS=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" INFO commandstats 2>&1 | grep -v "Warning" | grep "cmdstat_" | tr -d '\r')
if [ -n "$CMD_STATS" ]; then
# Each line looks like: cmdstat_get:calls=123,usec=...,usec_per_call=...
# Extract "<calls> <command>" pairs, sort numerically descending so the
# result is genuinely the TOP 5 by call count (the previous version took
# the first five lines in INFO output order, and its [a-z]+ pattern also
# dropped subcommand entries such as "config|get"), then format each entry
# as cmd:calls joined by commas.
CMD_DETAIL=$(echo "$CMD_STATS" | sed -n 's/^cmdstat_\([^:]*\):calls=\([0-9]*\).*/\2 \1/p' | sort -rn | head -5 | awk '{print $2":"$1}' | tr '\n' ',' | sed 's/,$//')
if [ -n "$CMD_DETAIL" ]; then
echo "COMMAND_STATS_TOP5:$CMD_DETAIL"
else
echo "COMMAND_STATS_TOP5:N/A"
fi
else
echo "COMMAND_STATS_TOP5:N/A"
fi
# Client-list summary: total clients, clients idle >5min, blocked clients.
CLIENT_LIST=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" CLIENT LIST 2>&1 | grep -v "Warning" | grep "id=" | tr -d '\r')
if [ -n "$CLIENT_LIST" ]; then
TOTAL_CLIENTS=$(echo "$CLIENT_LIST" | wc -l)
IDLE_CLIENTS=0
BLOCKING_CLIENTS=0
# Splitting each line at the FIRST '=' leaves the rest of the record in
# $value, where the idle=/blocking= fields are matched by regex.
while IFS='=' read -r key value; do
if [[ "$value" =~ idle=([0-9]+) ]]; then
idle_time=${BASH_REMATCH[1]}
if [ "$idle_time" -gt 300 ]; then
IDLE_CLIENTS=$((IDLE_CLIENTS + 1))
fi
fi
if [[ "$value" =~ blocking=1 ]]; then
BLOCKING_CLIENTS=$((BLOCKING_CLIENTS + 1))
fi
done <<< "$CLIENT_LIST"
echo "CLIENT_DETAIL:Total:$TOTAL_CLIENTS,IdleOver5min:$IDLE_CLIENTS,Blocking:$BLOCKING_CLIENTS"
else
echo "CLIENT_DETAIL:N/A"
fi
# Cache hit rate from keyspace_hits/keyspace_misses, plus rejected connections.
STATS_INFO=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" INFO stats 2>&1 | grep -v "Warning" | grep -E "keyspace_hits:|keyspace_misses:|rejected_connections:" | tr -d '\r')
if [ -n "$STATS_INFO" ]; then
HITS=$(echo "$STATS_INFO" | grep keyspace_hits | cut -d: -f2)
MISSES=$(echo "$STATS_INFO" | grep keyspace_misses | cut -d: -f2)
REJECTED=$(echo "$STATS_INFO" | grep rejected_connections | cut -d: -f2)
if [ -n "$HITS" ] && [ -n "$MISSES" ]; then
TOTAL_REQ=$((HITS + MISSES))
if [ "$TOTAL_REQ" -gt 0 ]; then
HIT_RATE=$(awk "BEGIN {printf \"%.2f\", ($HITS * 100) / $TOTAL_REQ}")
else
HIT_RATE="0.00"
fi
echo "CACHE_HIT_RATE:Hits:$HITS,Misses:$MISSES,Rate:$HIT_RATE%"
else
echo "CACHE_HIT_RATE:N/A"
fi
if [ -n "$REJECTED" ]; then
echo "REJECTED_CONNECTIONS:$REJECTED"
fi
else
echo "CACHE_HIT_RATE:N/A"
echo "REJECTED_CONNECTIONS:0"
fi
# Configuration spot-check: maxclients, timeout, RDB save policy.
# CONFIG GET prints the key on one line and the value on the next; tail -1
# keeps only the value.
CONFIG_MAXCLIENTS=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" CONFIG GET maxclients 2>&1 | grep -v "Warning" | tail -1 | tr -d '\r')
CONFIG_TIMEOUT=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" CONFIG GET timeout 2>&1 | grep -v "Warning" | tail -1 | tr -d '\r')
CONFIG_SAVE=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" CONFIG GET save 2>&1 | grep -v "Warning" | tail -1 | tr -d '\r')
echo "CONFIG_CHECK:MaxClients:$CONFIG_MAXCLIENTS,Timeout:${CONFIG_TIMEOUT}s,Save:$CONFIG_SAVE"
# ========== Addition: cluster state detection ==========
CLUSTER_INFO=$(docker exec $CONTAINER redis-cli -a "$REDIS_PASSWORD" CLUSTER INFO 2>&1 | grep -v "Warning")
# Parse cluster_state first and gate on it. BUGFIX: when cluster support is
# disabled, CLUSTER INFO returns an error line ("ERR This instance has
# cluster support disabled"), which is non-empty — so testing only
# -n "$CLUSTER_INFO" entered the cluster branch and printed an empty State
# instead of reporting Standalone.
CLUSTER_STATE=$(echo "$CLUSTER_INFO" | grep -oP 'cluster_state:\K\w+' | head -1)
if [ -n "$CLUSTER_STATE" ]; then
CLUSTER_SLOTS_ASSIGNED=$(echo "$CLUSTER_INFO" | grep -oP 'cluster_slots_assigned:\K\d+' | head -1)
CLUSTER_SLOTS_OK=$(echo "$CLUSTER_INFO" | grep -oP 'cluster_slots_ok:\K\d+' | head -1)
CLUSTER_KNOWN_NODES=$(echo "$CLUSTER_INFO" | grep -oP 'cluster_known_nodes:\K\d+' | head -1)
if [ "$CLUSTER_STATE" = "ok" ]; then
echo "CLUSTER_STATUS:State:OK,Nodes:${CLUSTER_KNOWN_NODES:-1},Slots:${CLUSTER_SLOTS_ASSIGNED:-0}/${CLUSTER_SLOTS_OK:-0}"
else
echo "CLUSTER_STATUS:State:${CLUSTER_STATE},Nodes:${CLUSTER_KNOWN_NODES:-1}"
fi
else
# Not running in cluster mode (or CLUSTER INFO unavailable) — standalone.
echo "CLUSTER_STATUS:Standalone"
fi
# Sanity-check the syntax of check_server_health.ps1.
$ErrorActionPreference = "Stop"
try {
# Load the target script's text for inspection.
$scriptPath = Join-Path $PSScriptRoot "check_server_health.ps1"
$content = Get-Content $scriptPath -Raw
Write-Host "正在检查脚本语法..." -ForegroundColor Cyan
# Spot-check the key fixed-up declarations: both password variables must be
# assigned from single-quoted literals.
if ($content -match '\$MYSQL_PASSWORD = ''[^'']*''') {
Write-Host "✓ MySQL密码变量语法正确" -ForegroundColor Green
} else {
Write-Host "✗ MySQL密码变量可能有问题" -ForegroundColor Red
}
if ($content -match '\$REDIS_PASSWORD = ''[^'']*''') {
Write-Host "✓ Redis密码变量语法正确" -ForegroundColor Green
} else {
Write-Host "✗ Redis密码变量可能有问题" -ForegroundColor Red
}
Write-Host "`n语法检查完成!" -ForegroundColor Green
}
catch {
Write-Host "语法检查失败: $($_.Exception.Message)" -ForegroundColor Red
}
# filepath: e:\GithubData\自动化\ubains-module-test\AuxiliaryTool\ScriptTool\自动化服务监测\AutomatedServiceMonitoring.sh
#!/usr/bin/env bash
########################################
# Automated service monitoring shell script (single host, single run).
########################################
set -u
#################### Global configuration ####################
MYSQL_USER="root"
# NOTE(review): credentials are hard-coded below (MySQL, SMTP, DingTalk);
# consider loading them from the environment or a protected config file.
MYSQL_PASSWORD="dNrprU&2S"
LOG_FILE="./AutomatedServiceMonitoring.sh.log"
REPORT_DIR="./monitor_reports"
WORD_REPORT_DIR="./monitor_reports_word"
HOST_NAME="${HOST_NAME_OVERRIDE:-$(hostname 2>/dev/null || echo localhost)}"
# Mail notification settings
MAIL_TO="czj@huazhaochina.com,pgy@huazhaochina.com,zxb@huazhaochina.com"
# Mail subject prefix.
# 1) Edit the default value below directly, or
# 2) override it at runtime via the environment, e.g.:
#    MAIL_SUBJECT_PREFIX="【产线】服务监测报告" ./AutomatedServiceMonitoring.sh
# BUGFIX: the previous chain of three unconditional assignments clobbered any
# value exported by the caller, defeating the documented override; a single
# ":-" default keeps the override working while preserving the same default.
MAIL_SUBJECT_PREFIX="${MAIL_SUBJECT_PREFIX:-【内部服务器监测】}"
# SMTP server settings (QQ enterprise mailbox)
MAIL_SMTP_HOST="smtp.exmail.qq.com"
MAIL_SMTP_PORT="465"
MAIL_SMTP_SSL_SCHEME="smtps" # implicit-TLS SMTP
MAIL_SMTP_USER="czj@huazhaochina.com"
MAIL_SMTP_PASS="Ubains@123"
# DingTalk robot settings (custom robot -> security settings -> signing)
DINGDING_ACCESS_TOKEN="7fbf40798cad98b1b5db55ff844ba376b1816e80c5777e6f47ae1d9165dacbb4"
DINGDING_SECRET="SEC610498ed6261ae2df1d071d0880aaa70abf5e67efe47f75a809c1f2314e0dbd6"
# Leave both empty / "false" when nobody needs to be @-mentioned
DINGDING_AT_MOBILES=""
DINGDING_AT_ALL="false"
#################### Logging ####################
#######################################
# Emit a timestamped, level-tagged message to stdout and append it to the
# log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - level tag (e.g. INFO, ERROR); remaining args - message text
# Outputs:   formatted line on stdout; same line appended to $LOG_FILE
#######################################
log() {
  local tag="$1"
  shift
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  local line="[$stamp] [$tag] $*"
  printf '%s\n' "$line"
  printf '%s\n' "$line" >>"$LOG_FILE"
}
#################### Dependency installation ####################
#######################################
# Detect the available package manager.
# Outputs: "yum", "apt", or "unknown" on stdout.
#######################################
detect_os_and_pkg_mgr() {
  local mgr="unknown"
  if command -v yum >/dev/null 2>&1; then
    mgr="yum"
  elif command -v apt-get >/dev/null 2>&1; then
    mgr="apt"
  fi
  echo "$mgr"
}
#######################################
# Ensure the mail/mailx command is available, installing it via the detected
# package manager (yum -> mailx, apt -> mailutils) when missing.
# Globals:   LOG_FILE (written via log)
# Returns:   0 when mail/mailx is usable, 1 otherwise.
#######################################
ensure_mailx_installed() {
if command -v mail >/dev/null 2>&1 || command -v mailx >/dev/null 2>&1; then
log INFO "[依赖检查] mail/mailx 已安装"
return 0
fi
local pkg_mgr
pkg_mgr="$(detect_os_and_pkg_mgr)"
log INFO "[依赖安装] 检测到包管理器:${pkg_mgr}"
if [[ "$pkg_mgr" == "yum" ]]; then
log INFO "[依赖安装] 尝试使用 yum 安装 mailx"
yum install -y mailx >>"$LOG_FILE" 2>&1 || {
log ERROR "[依赖安装] 使用 yum 安装 mailx 失败,请手工检查。"
return 1
}
elif [[ "$pkg_mgr" == "apt" ]]; then
log INFO "[依赖安装] 尝试使用 apt-get 安装 mailutils"
# A failed index refresh is tolerated; the install step below decides.
apt-get update >>"$LOG_FILE" 2>&1 || true
apt-get install -y mailutils >>"$LOG_FILE" 2>&1 || {
log ERROR "[依赖安装] 使用 apt-get 安装 mailutils 失败,请手工检查。"
return 1
}
else
log ERROR "[依赖安装] 未识别到可用的包管理器(yum/apt-get),无法自动安装 mailx。"
return 1
fi
# Re-check after installation to confirm the command is actually usable.
if command -v mail >/dev/null 2>&1 || command -v mailx >/dev/null 2>&1; then
log INFO "[依赖安装] mail/mailx 安装完成"
return 0
else
log ERROR "[依赖安装] mail/mailx 安装后仍不可用,请手工检查。"
return 1
fi
}
# ensure_pandoc_installed() {
# if command -v pandoc >/dev/null 2>&1; then
# log INFO "[依赖检查] pandoc 已安装"
# return 0
# fi
# local pkg_mgr
# pkg_mgr="$(detect_os_and_pkg_mgr)"
# log INFO "[依赖安装] 检测到包管理器:${pkg_mgr}"
# if [[ "$pkg_mgr" == "yum" ]]; then
# log INFO "[依赖安装] 使用 yum 安装 pandoc(需要 epel 源)"
# yum install -y epel-release >>"$LOG_FILE" 2>&1 || true
# yum install -y pandoc >>"$LOG_FILE" 2>&1 || {
# log ERROR "[依赖安装] 使用 yum 安装 pandoc 失败,请手工检查。"
# return 1
# }
# elif [[ "$pkg_mgr" == "apt" ]]; then
# log INFO "[依赖安装] 使用 apt-get 安装 pandoc"
# apt-get update >>"$LOG_FILE" 2>&1 || true
# apt-get install -y pandoc >>"$LOG_FILE" 2>&1 || {
# log ERROR "[依赖安装] 使用 apt-get 安装 pandoc 失败,请手工检查。"
# return 1
# }
# else
# log ERROR "[依赖安装] 未识别到可用的包管理器(yum/apt-get),无法自动安装 pandoc。"
# return 1
# fi
# if command -v pandoc >/dev/null 2>&1; then
# log INFO "[依赖安装] pandoc 安装完成"
# return 0
# else
# log ERROR "[依赖安装] pandoc 安装后仍不可用,请手工检查。"
# return 1
# fi
# }
# Write /etc/mail.rc so mail/mailx sends through the QQ enterprise SMTP
# server using the MAIL_SMTP_* globals. Skips the write when an existing
# /etc/mail.rc already mentions smtp.exmail.qq.com. Always returns 0.
# NOTE(review): requires root to write /etc/mail.rc; also persists the SMTP
# password to disk and sets ssl-verify=ignore (no certificate validation).
ensure_mailx_smtp_config() {
local cfg_file="/etc/mail.rc"
# If the file exists and already targets smtp.exmail.qq.com, do not overwrite.
if [[ -f "$cfg_file" ]] && grep -q "smtp.exmail.qq.com" "$cfg_file" 2>/dev/null; then
log INFO "[邮件配置] 检测到已有 /etc/mail.rc 且包含 smtp.exmail.qq.com,跳过覆盖。"
return 0
fi
log INFO "[邮件配置] 生成 /etc/mail.rc(使用 QQ 企业邮箱 SMTP)"
cat >"$cfg_file" <<EOF
set from="${MAIL_SMTP_USER}"
set smtp="${MAIL_SMTP_SSL_SCHEME}://${MAIL_SMTP_HOST}:${MAIL_SMTP_PORT}"
set smtp-auth=login
set smtp-auth-user="${MAIL_SMTP_USER}"
set smtp-auth-password="${MAIL_SMTP_PASS}"
set ssl-verify=ignore
EOF
# Optional: mirror the config to /etc/nail.rc (some mailx builds read it).
cp "$cfg_file" /etc/nail.rc 2>/dev/null || true
log INFO "[邮件配置] /etc/mail.rc 已生成:smtp=${MAIL_SMTP_HOST}:${MAIL_SMTP_PORT} user=${MAIL_SMTP_USER}"
return 0
}
#################### 7. DingTalk delivery (pure Bash + openssl + curl) ####################
# Build the final DingTalk robot webhook URL (access_token + timestamp + sign).
# $1 = access token, $2 = signing secret (may be empty for unsigned robots).
# Prints the URL on stdout; returns 1 (printing "") when the token is empty.
build_dingtalk_url() {
local access_token="$1"
local secret="$2"
if [[ -z "$access_token" ]]; then
echo ""
return 1
fi
# Millisecond timestamp (epoch seconds with "000" appended — second
# resolution only, which DingTalk accepts).
local timestamp
timestamp="$(printf '%s000' "$(date +%s)")"
# Signing disabled (empty secret): token + timestamp is enough.
if [[ -z "$secret" ]]; then
echo "https://oapi.dingtalk.com/robot/send?access_token=${access_token}&timestamp=${timestamp}"
return 0
fi
# string_to_sign = timestamp + "\n" + secret (per DingTalk's signing spec)
local string_to_sign="${timestamp}"$'\n'"${secret}"
# HMAC-SHA256 of string_to_sign keyed by the secret, then base64.
local sign_raw
sign_raw="$(printf '%s' "$string_to_sign" \
| openssl dgst -sha256 -hmac "$secret" -binary \
| openssl base64)"
# URL-encode — strictly mirroring Python's urllib.parse.quote_plus:
#   1. letters/digits and . - _ ~ are kept as-is
#   2. space -> +
#   3. everything else (including + / =) becomes %XX
# The base64 alphabet is pure ASCII, so the single-char %02X encoding below
# is safe here (it would not be for multi-byte characters).
local sign_enc=""
local i ch hex
for ((i=0; i<${#sign_raw}; i++)); do
ch="${sign_raw:$i:1}"
case "$ch" in
[a-zA-Z0-9.~_-])
sign_enc+="$ch"
;;
' ')
sign_enc+="+"
;;
*)
printf -v hex '%%%02X' "'$ch"
sign_enc+="$hex"
;;
esac
done
echo "https://oapi.dingtalk.com/robot/send?access_token=${access_token}&timestamp=${timestamp}&sign=${sign_enc}"
return 0
}
# Send a plain-text message to the configured DingTalk robot webhook.
# Usage: set DD_TEXT first, then:  DD_TEXT="$dd_text" send_dingtalk_markdown
# Reads DINGDING_ACCESS_TOKEN / DINGDING_SECRET / DINGDING_AT_MOBILES /
# DINGDING_AT_ALL globals. Returns 0 only when DingTalk answers errcode 0.
send_dingtalk_markdown() {
    local access_token="${DINGDING_ACCESS_TOKEN:-}"
    local secret="${DINGDING_SECRET:-}"
    local at_mobiles="${DINGDING_AT_MOBILES:-}"
    local at_all="${DINGDING_AT_ALL:-false}"
    local content="${DD_TEXT:-}"
    if [[ -z "$access_token" ]]; then
        log ERROR "[钉钉发送] 未配置 DINGDING_ACCESS_TOKEN,跳过发送。"
        return 1
    fi
    if [[ -z "$content" ]]; then
        log ERROR "[钉钉发送] DD_TEXT 为空,跳过发送。"
        return 1
    fi
    local url
    url="$(build_dingtalk_url "$access_token" "$secret")" || {
        log ERROR "[钉钉发送] 构造 Webhook URL 失败,跳过发送。"
        return 1
    }
    # Build the "@" section of the payload.
    # BUG FIX: mobile_items must be initialized to "". The original only
    # declared it (local tmp arr mobile_items m), so the first
    # [[ -n "$mobile_items" ]] read hit an unbound variable under `set -u`
    # (the rest of this script explicitly guards for set -u).
    local at_json=""
    if [[ -n "$at_mobiles" ]]; then
        local tmp arr m
        local mobile_items=""
        tmp="$(echo "$at_mobiles" | tr ',' ' ')"
        # shellcheck disable=SC2206  # intentional word-splitting of the list
        arr=($tmp)
        for m in "${arr[@]}"; do
            [[ -z "$m" ]] && continue
            if [[ -n "$mobile_items" ]]; then
                mobile_items="${mobile_items},\"${m}\""
            else
                mobile_items="\"${m}\""
            fi
        done
        at_json="\"at\": {\"atMobiles\": [${mobile_items}], \"isAtAll\": ${at_all} }"
    else
        at_json="\"at\": {\"isAtAll\": ${at_all} }"
    fi
    # Minimal JSON escaping of the body: backslash, double quote, newline.
    # NOTE(review): other control characters (\t, \r, ...) are not escaped.
    local json_content
    json_content="${content//$'\\'/'\\\\'}" # backslash
    json_content="${json_content//$'"'/'\"'}" # double quote
    json_content="${json_content//$'\n'/\\n}" # newline -> \n
    # Assemble the "text" message payload.
    local payload
    payload="$(cat <<EOF
{
"msgtype": "text",
"text": {
"content": "${json_content}"
},
${at_json}
}
EOF
)"
    # POST to the webhook; DingTalk answers {"errcode":0,...} on success.
    local resp
    resp="$(curl -sS -H 'Content-Type: application/json;charset=utf-8' -d "$payload" "$url" 2>>"$LOG_FILE" || true)"
    if echo "$resp" | grep -q '"errcode":0'; then
        log INFO "[钉钉发送] 已成功发送消息。响应:$resp"
        return 0
    else
        log ERROR "[钉钉发送] 发送失败,响应:$resp"
        return 1
    fi
}
#################### 1. Platform detection ####################
# Decide which deployment layout this host uses and set the PLATFORM_TYPE
# and BASE_PATH globals: "new" rooted at /data/services when that directory
# exists, otherwise "legacy" rooted at /var/www.
detect_platform() {
    if [ -d "/data/services" ]; then
        PLATFORM_TYPE="new"
        BASE_PATH="/data/services"
        return
    fi
    PLATFORM_TYPE="legacy"
    BASE_PATH="/var/www"
}
#################### 2. System detection ####################
# Inspect the names of running docker containers and flag which product
# systems are present. Sets HAS_UJAVA / HAS_UPYTHON / HAS_UPYTHON_VOICE
# (0/1) and appends "meeting" / "ops" / "transcription" to SYSTEMS.
# NOTE(review): these are substring matches — a container named
# "upython_voice" also matches the "upython" check, so HAS_UPYTHON gets set
# even when only the voice container runs. Confirm whether that is intended.
detect_systems() {
HAS_UJAVA=0
HAS_UPYTHON=0
HAS_UPYTHON_VOICE=0
SYSTEMS=()
local names
# Empty when docker is absent or the daemon is unreachable.
names="$(docker ps --format '{{.Names}}' 2>/dev/null || true)"
if echo "$names" | grep -q "ujava"; then
HAS_UJAVA=1
SYSTEMS+=("meeting")
fi
if echo "$names" | grep -q "upython"; then
HAS_UPYTHON=1
SYSTEMS+=("ops")
fi
if echo "$names" | grep -q "upython_voice"; then
HAS_UPYTHON_VOICE=1
SYSTEMS+=("transcription")
fi
}
#################### 3.1 Log audit (burst detection only, no ERROR analysis) ####################
# Build the list of log files to audit for burst detection. Fills the
# parallel arrays LOG_TARGET_SYS (display name) and LOG_TARGET_PATH (file
# path) from the SYSTEMS / PLATFORM_TYPE / BASE_PATH globals. Only the
# meeting system currently contributes targets.
resolve_log_targets() {
    LOG_TARGET_SYS=()
    LOG_TARGET_PATH=()
    local entry meeting_found=0
    for entry in "${SYSTEMS[@]}"; do
        [[ "$entry" == "meeting" ]] && meeting_found=1
    done
    (( meeting_found )) || return 0
    if [[ "$PLATFORM_TYPE" == "new" ]]; then
        # Unified platform ships both the 2.0 and 3.0 backends.
        LOG_TARGET_SYS+=("meeting-2.0" "meeting-3.0")
        LOG_TARGET_PATH+=(
            "$BASE_PATH/api/java-meeting/java-meeting2.0/logs/ubains-INFO-AND-ERROR.log"
            "$BASE_PATH/api/java-meeting/java-meeting3.0/logs/ubains-INFO-AND-ERROR.log"
        )
    else
        LOG_TARGET_SYS+=("meeting-2.0")
        LOG_TARGET_PATH+=("/var/www/java/api-java-meeting2.0/logs/ubains-INFO-AND-ERROR.log")
    fi
}
declare -A BURST_LAST_TOTAL
declare -A BURST_LAST_TS
declare -A BURST_LAST_RESULT
declare -A BURST_LAST_DESC
make_log_key() {
local sys_name="$1"
local log_path="$2"
echo "${sys_name}|${log_path}"
}
# Take one burst-detection sample for a single log file.
# $1 = system display name, $2 = log file path.
# Tracks the file's total line count between calls in the BURST_LAST_*
# associative arrays and, once at least window_seconds (300s) have elapsed
# since the previous sample, flags a "burst" when the window added
# >= min_lines_threshold (1000) lines OR averaged >= rate_threshold_per_sec
# (5) lines/second. The outcome is stored in BURST_LAST_RESULT[key]
# (NO_FILE / UNKNOWN / INIT / COLLECTING / BURST / OK) plus a human-readable
# BURST_LAST_DESC[key], both consumed later by the report writer.
monitor_log_burst_once() {
local sys_name="$1"
local log_path="$2"
local window_seconds=300
local min_lines_threshold=1000
local rate_threshold_per_sec=5
local key
key="$(make_log_key "$sys_name" "$log_path")"
if [ ! -f "$log_path" ]; then
log INFO "[日志暴涨审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 文件不存在"
BURST_LAST_RESULT["$key"]="NO_FILE"
BURST_LAST_DESC["$key"]="日志文件不存在"
return
fi
# Current total line count; non-numeric output (e.g. unreadable file) is
# treated as "unknown" rather than an error.
local total_lines
total_lines="$(wc -l < "$log_path" 2>/dev/null || echo 0)"
total_lines=$(echo "$total_lines" | tr -d ' ')
if ! [[ "$total_lines" =~ ^[0-9]+$ ]]; then
log INFO "[日志暴涨审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 无法获取总行数"
BURST_LAST_RESULT["$key"]="UNKNOWN"
BURST_LAST_DESC["$key"]="无法获取总行数"
return
fi
local now_ts
now_ts="$(date +%s)"
# "-" default expansions keep set -u happy on the first sample.
local last_total="${BURST_LAST_TOTAL[$key]-}"
local last_ts="${BURST_LAST_TS[$key]-}"
if [[ -z "${last_total:-}" || -z "${last_ts:-}" ]]; then
# First sample for this file: just open the window.
BURST_LAST_TOTAL["$key"]="$total_lines"
BURST_LAST_TS["$key"]="$now_ts"
log INFO "[日志暴涨审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 初始化窗口,总行数=$total_lines"
BURST_LAST_RESULT["$key"]="INIT"
BURST_LAST_DESC["$key"]="初始化窗口,总行数=${total_lines}"
return
fi
local elapsed=$(( now_ts - last_ts ))
(( elapsed <= 0 )) && elapsed=1
# Negative delta (log rotated/truncated) is clamped to 0.
local delta_lines=$(( total_lines - last_total ))
(( delta_lines < 0 )) && delta_lines=0
BURST_LAST_TOTAL["$key"]="$total_lines"
BURST_LAST_TS["$key"]="$now_ts"
if (( elapsed < window_seconds )); then
# Window not yet full — keep accumulating.
log INFO "[日志暴涨审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 累积中:窗口=${elapsed}s 未达 ${window_seconds}s,新增行数=${delta_lines}"
BURST_LAST_RESULT["$key"]="COLLECTING"
BURST_LAST_DESC["$key"]="窗口${elapsed}s/未达${window_seconds}s,新增行数=${delta_lines}"
return
fi
# Lines/second as a 2-decimal float (awk, since bash lacks float math).
local rate
rate=$(awk -v d="$delta_lines" -v e="$elapsed" 'BEGIN{ if(e<=0){e=1}; printf "%.2f", d/e }')
local start_ts_human end_ts_human
start_ts_human="$(date -d @"$last_ts" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
end_ts_human="$(date -d @"$now_ts" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
local desc="窗口=${elapsed}s 新增行数=${delta_lines} 速率=${rate}行/秒 时间段=[${start_ts_human} ~ ${end_ts_human}]"
# Burst when either the absolute-line or the rate threshold is exceeded;
# the awk exit trick turns the float comparison into a shell status.
if (( delta_lines >= min_lines_threshold )) || awk -v r="$rate" -v th="$rate_threshold_per_sec" 'BEGIN{exit !(r>=th)}'; then
log WARN "[日志打印暴涨] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path ${desc}"
BURST_LAST_RESULT["$key"]="BURST"
BURST_LAST_DESC["$key"]="$desc"
else
log INFO "[日志暴涨审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path ${desc},未发现暴涨"
BURST_LAST_RESULT["$key"]="OK"
BURST_LAST_DESC["$key"]="$desc"
fi
}
#################### 3.2 Memory ####################
# Rolling memory statistics (MB): sample count, running sum (for the
# average), peak value with its epoch timestamp, and the latest reading.
MEM_SAMPLES=0
MEM_SUM_USED_MB=0
MEM_PEAK_USED_MB=0
MEM_PEAK_TS=0
MEM_LAST_USED_MB=0
# Take one memory sample from /proc/meminfo and fold it into the rolling
# statistics above. "Used" is MemTotal - MemAvailable, converted to MB and
# clamped at 0. Logs current/average/peak; on read or parse failure it logs
# an error and leaves all statistics untouched.
monitor_mem_once() {
    local meminfo
    meminfo="$(cat /proc/meminfo 2>/dev/null)" || {
        log ERROR "[内存监测] $HOST_NAME => 获取 /proc/meminfo 失败"
        return
    }
    local total_kb avail_kb
    total_kb="$(echo "$meminfo" | awk '/^MemTotal:/ {print $2}')"
    avail_kb="$(echo "$meminfo" | awk '/^MemAvailable:/ {print $2}')"
    if [[ -z "$total_kb" || -z "$avail_kb" ]]; then
        log ERROR "[内存监测] $HOST_NAME => 解析失败"
        return
    fi
    # FIX: the original also computed total_mb here but never used it; the
    # dead computation has been removed.
    local used_mb
    used_mb=$(awk -v t="$total_kb" -v a="$avail_kb" 'BEGIN{u=(t-a)/1024; if(u<0)u=0; printf "%.0f",u}')
    MEM_LAST_USED_MB="$used_mb"
    MEM_SAMPLES=$((MEM_SAMPLES + 1))
    MEM_SUM_USED_MB=$((MEM_SUM_USED_MB + used_mb))
    if (( used_mb > MEM_PEAK_USED_MB )); then
        MEM_PEAK_USED_MB=$used_mb
        MEM_PEAK_TS="$(date +%s)"
    fi
    local avg_used=$(( MEM_SUM_USED_MB / MEM_SAMPLES ))
    local peak_human="N/A"
    if [[ "$MEM_PEAK_TS" != "0" && -n "$MEM_PEAK_TS" ]]; then
        peak_human="$(date -d @"$MEM_PEAK_TS" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
    fi
    log INFO "[内存监测] $HOST_NAME 当前使用=${used_mb}MB 平均=${avg_used}MB 峰值=${MEM_PEAK_USED_MB}MB@$peak_human"
}
#################### 3.3 MySQL ####################
# Rolling MySQL connection statistics plus burst-detection window state.
MYSQL_SAMPLES=0
MYSQL_SUM_CONN=0
MYSQL_PEAK_CONN=0
MYSQL_PEAK_TS=0
MYSQL_LAST_TOTAL=""
MYSQL_LAST_TS=""
MYSQL_LAST_CONN=0
MYSQL_LAST_BURST_STATUS="UNKNOWN"
MYSQL_LAST_BURST_DESC=""
# Print the name of the first running container whose name or image
# (case-insensitively) contains "mysql" or "mariadb". Prints nothing when
# no such container exists or docker is unavailable.
find_mysql_container() {
    docker ps --format '{{.Names}} {{.Image}}' 2>/dev/null | \
        awk '{
            cname=$1; $1=""; img=$0;
            hay=tolower(cname " " img);
            if (index(hay, "mysql") || index(hay, "mariadb")) {
                print cname
                exit 0
            }
        }'
}
# Read the Threads_connected count from a MySQL/MariaDB container.
# $1 = container name. Tries `mysql -e "SHOW STATUS ..."` first, then falls
# back to parsing `mysqladmin status`. Prints a bare integer and returns 0
# on success; returns 1 when neither method yields a number.
# NOTE(review): MYSQL_USER / MYSQL_PASSWORD are defined elsewhere in the
# file — confirm they are set before this runs. Passing -p<password> on the
# command line exposes it to ps(1) inside the container.
get_mysql_threads_connected_via_container() {
local container="$1"
local auth="-u${MYSQL_USER} -p${MYSQL_PASSWORD}"
local out
# ${auth} is intentionally unquoted: it must split into -u... and -p...
out="$(docker exec -i "${container}" mysql -ss ${auth} -e "SHOW STATUS LIKE 'Threads_connected';" 2>/dev/null | tail -n1 | cut -f2)"
if [[ "$out" =~ ^[0-9]+$ ]]; then
echo "$out"
return 0
fi
# Fallback: "Threads: N" from mysqladmin status output.
out="$(docker exec -i "${container}" mysqladmin ${auth} status 2>/dev/null)"
local val
val="$(echo "$out" | grep -oE 'Threads:[[:space:]]*[0-9]+' | awk '{print $2}')"
if [[ "$val" =~ ^[0-9]+$ ]]; then
echo "$val"
return 0
fi
return 1
}
# Take one MySQL connection-count sample and update the rolling statistics
# plus the burst-detection window.
# Acquisition order: (1) inside a detected MySQL/MariaDB container,
# (2) local mysqladmin, (3) local mysql client. On total failure the burst
# status is set to ERROR and the sample is dropped.
# Burst rule: once >= window_seconds (300s) elapsed since the last window
# start, flag BURST when connections grew by >= min_burst_conn (200) or at
# >= rate_threshold_per_sec (1/s); otherwise OK. Before the window fills,
# status is COLLECTING; the very first sample is INIT.
monitor_mysql_once() {
local conn=""
local container
container="$(find_mysql_container || true)"
if [[ -n "$container" ]]; then
conn="$(get_mysql_threads_connected_via_container "$container" || true)"
if ! [[ "$conn" =~ ^[0-9]+$ ]]; then
log ERROR "[MySQL监测] $HOST_NAME => 在容器 $container 内获取连接数失败"
conn=""
fi
else
log INFO "[MySQL监测] $HOST_NAME => 未检测到 MySQL/MariaDB 容器,回退本机命令"
fi
# Fallback 1: local mysqladmin status ("Threads: N").
if [[ -z "$conn" ]]; then
local auth="-u${MYSQL_USER} -p${MYSQL_PASSWORD}"
local out
out="$(mysqladmin ${auth} status 2>/dev/null || true)"
conn="$(echo "$out" | grep -oE 'Threads:[[:space:]]*[0-9]+' | awk '{print $2}')"
fi
# Fallback 2: local mysql client, SHOW STATUS.
if [[ -z "$conn" || ! "$conn" =~ ^[0-9]+$ ]]; then
local auth="-u${MYSQL_USER} -p${MYSQL_PASSWORD}"
local out2
out2="$(mysql -ss ${auth} -e "SHOW STATUS LIKE 'Threads_connected';" 2>/dev/null | tail -n1 | cut -f2)"
if [[ "$out2" =~ ^[0-9]+$ ]]; then
conn="$out2"
fi
fi
if [[ -z "$conn" || ! "$conn" =~ ^[0-9]+$ ]]; then
log ERROR "[MySQL监测] $HOST_NAME => 获取连接数失败(请确认 Docker/MySQL 客户端与权限)"
MYSQL_LAST_BURST_STATUS="ERROR"
MYSQL_LAST_BURST_DESC="无法获取连接数"
return
fi
# Fold the sample into current/sum/peak statistics.
MYSQL_LAST_CONN="$conn"
local now_ts
now_ts="$(date +%s)"
MYSQL_SAMPLES=$((MYSQL_SAMPLES + 1))
MYSQL_SUM_CONN=$((MYSQL_SUM_CONN + conn))
if (( conn > MYSQL_PEAK_CONN )); then
MYSQL_PEAK_CONN=$conn
MYSQL_PEAK_TS="$now_ts"
fi
local avg_conn=$(( MYSQL_SUM_CONN / MYSQL_SAMPLES ))
local peak_human="N/A"
if [[ -n "$MYSQL_PEAK_TS" && "$MYSQL_PEAK_TS" != "0" ]]; then
peak_human="$(date -d @"$MYSQL_PEAK_TS" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
fi
log INFO "[MySQL监测] $HOST_NAME 当前连接数=$conn 平均=${avg_conn} 峰值=${MYSQL_PEAK_CONN}@$peak_human"
# Burst-detection window over the connection count.
local window_seconds=300
local min_burst_conn=200
local rate_threshold_per_sec=1
if [[ -z "${MYSQL_LAST_TOTAL:-}" || -z "${MYSQL_LAST_TS:-}" ]]; then
MYSQL_LAST_TOTAL="$conn"
MYSQL_LAST_TS="$now_ts"
MYSQL_LAST_BURST_STATUS="INIT"
MYSQL_LAST_BURST_DESC="初始化窗口,总连接数=${conn}"
return
fi
local elapsed=$(( now_ts - MYSQL_LAST_TS ))
(( elapsed <= 0 )) && elapsed=1
# Negative growth (connections dropped) is clamped to 0.
local delta_conn=$(( conn - MYSQL_LAST_TOTAL ))
(( delta_conn < 0 )) && delta_conn=0
MYSQL_LAST_TOTAL="$conn"
MYSQL_LAST_TS="$now_ts"
if (( elapsed >= window_seconds )); then
local rate
rate=$(awk -v d="$delta_conn" -v e="$elapsed" 'BEGIN{ if(e<=0){e=1}; printf "%.2f", d/e }')
local start_ts_human end_ts_human
start_ts_human="$(date -d @"$((now_ts - elapsed))" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
end_ts_human="$(date -d @"$now_ts" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
local desc="窗口=${elapsed}s 增量=${delta_conn} 速率=${rate}/s 时间段=[${start_ts_human} ~ ${end_ts_human}]"
# Either the absolute-growth or the float rate threshold (awk exit trick).
if (( delta_conn >= min_burst_conn )) || awk -v r="$rate" -v th="$rate_threshold_per_sec" 'BEGIN{exit !(r>=th)}'; then
log WARN "[MySQL连接暴涨] $HOST_NAME ${desc}"
MYSQL_LAST_BURST_STATUS="BURST"
MYSQL_LAST_BURST_DESC="$desc"
else
log INFO "[MySQL暴涨审计] $HOST_NAME ${desc} 未发现暴涨"
MYSQL_LAST_BURST_STATUS="OK"
MYSQL_LAST_BURST_DESC="$desc"
fi
else
MYSQL_LAST_BURST_STATUS="COLLECTING"
MYSQL_LAST_BURST_DESC="窗口${elapsed}s/未达${window_seconds}s,当前连接数=${conn}"
fi
}
#################### 3.5 Container info collection ####################
# Multiline text describing running containers (for the report).
CONTAINER_RUNNING_LIST=""
# Multiline text describing non-running (exited/abnormal) containers.
CONTAINER_EXITED_LIST=""
# Populate the two globals above from `docker ps`. Each entry line carries
# ID/Name/Image/Status/CreatedAt; human-readable placeholder text is stored
# when docker is absent or a list is empty.
# NOTE(review): the '\t' inside the --format Go template is emitted by
# docker as the literal characters backslash-t, not a tab — confirm whether
# a real tab ({{"\t"}}) was intended for the report layout.
collect_container_info() {
# Bail out early (with placeholder text) when docker is not installed.
if ! command -v docker >/dev/null 2>&1; then
log INFO "[容器检测] 本机未安装 docker 命令,跳过容器信息检测"
CONTAINER_RUNNING_LIST="本机未检测到 docker 命令,无法获取容器信息。"
CONTAINER_EXITED_LIST="本机未检测到 docker 命令,无法获取容器信息。"
return
fi
# Running containers.
local running
running="$(docker ps --format '{{.ID}}\t{{.Names}}\t{{.Image}}\t{{.Status}}\t{{.CreatedAt}}' 2>/dev/null || true)"
if [[ -z "${running//[[:space:]]/}" ]]; then
CONTAINER_RUNNING_LIST="当前无运行中的容器。"
else
CONTAINER_RUNNING_LIST="$running"
fi
# Non-running containers (Exited first; any non-"Up" state as fallback).
local exited
exited="$(docker ps -a --filter 'status=exited' --format '{{.ID}}\t{{.Names}}\t{{.Image}}\t{{.Status}}\t{{.CreatedAt}}' 2>/dev/null || true)"
if [[ -z "${exited//[[:space:]]/}" ]]; then
# No Exited containers: pick up other non-Up states (paused/created/...).
local not_running
not_running="$(docker ps -a --format '{{.ID}}\t{{.Names}}\t{{.Image}}\t{{.Status}}\t{{.CreatedAt}}' | grep -v 'Up ' || true)"
if [[ -z "${not_running//[[:space:]]/}" ]]; then
CONTAINER_EXITED_LIST="当前无未运行的容器(没有 Exited/非 Up 状态的容器)。"
else
CONTAINER_EXITED_LIST="$not_running"
fi
else
CONTAINER_EXITED_LIST="$exited"
fi
# NOTE(review): the escaped \" below passes literal quote characters to
# echo; wc -l still counts lines correctly, so this is cosmetic only.
log INFO "[容器检测] 已采集容器信息:running=$(echo \"$CONTAINER_RUNNING_LIST\" | wc -l) 行,exited=$(echo \"$CONTAINER_EXITED_LIST\" | wc -l) 行"
}
# Heuristically decide whether the installed mailx accepts custom headers
# via "-a" (true for s-nail; heirloom mailx treats -a as an attachment and
# is reported as unsupported). Returns 0 when "-a <header>" looks usable,
# 1 otherwise (including when mailx is absent).
mailx_supports_a_header() {
    command -v mailx >/dev/null 2>&1 || return 1
    # Loose vendor sniffing on the version banner (output varies by distro).
    local banner
    banner="$(mailx -V 2>&1)"
    if grep -qiE 's-nail|heirloom' <<<"$banner"; then
        # s-nail supports "-a header"; heirloom does not (treats it as
        # an attachment), so only the s-nail match counts as supported.
        grep -qi 's-nail' <<<"$banner"
        return
    fi
    # Last resort: does the help text explicitly mention "-a header"?
    mailx -h 2>&1 | grep -qiE '(\-a[[:space:]]+header|\-a[[:space:]]+.*header)'
}
# Ensure a sendmail(8) binary exists, installing postfix (which provides
# sendmail) via yum or apt-get when missing, and enabling the service.
# Install output goes to $LOG_FILE.
# Return value: 0 when sendmail is (now) present; non-zero when the package
# manager is unknown, the install fails, or — via the final && chain —
# sendmail is still absent after installation.
ensure_sendmail_installed() {
if command -v sendmail >/dev/null 2>&1; then
log INFO "[依赖检查] sendmail 已安装"
return 0
fi
local pkg_mgr
pkg_mgr="$(detect_os_and_pkg_mgr)"
log INFO "[依赖安装] 尝试安装 sendmail/postfix,pkg_mgr=${pkg_mgr}"
if [[ "$pkg_mgr" == "yum" ]]; then
yum install -y postfix >>"$LOG_FILE" 2>&1 || return 1
systemctl enable --now postfix >>"$LOG_FILE" 2>&1 || true
elif [[ "$pkg_mgr" == "apt" ]]; then
apt-get update >>"$LOG_FILE" 2>&1 || true
apt-get install -y postfix >>"$LOG_FILE" 2>&1 || return 1
systemctl enable --now postfix >>"$LOG_FILE" 2>&1 || true
else
log ERROR "[依赖安装] 未识别包管理器,无法自动安装 postfix"
return 1
fi
# Implicit return status: fails when sendmail is still unavailable.
command -v sendmail >/dev/null 2>&1 && log INFO "[依赖安装] sendmail 安装完成"
}
#################### 3.6 Disk space ####################
# Rolling disk-usage statistics, in percent of the root filesystem.
DISK_SAMPLES=0
DISK_SUM_USED_PCT=0
DISK_PEAK_USED_PCT=0
DISK_PEAK_TS=0
DISK_LAST_USED_PCT=0
# Raw `df -P` capture (parsed for the max) and its display companion file.
DISK_DF_RAW_FILE=""
DISK_DF_FILE=""
# Pre-initialised so `set -u` never trips on an unbound variable.
DISK_LAST_DF_TEXT=""
# Print the root filesystem's Use% as a bare integer (no % sign).
# Depends only on `df -P /`; returns 1 when df is unavailable.
get_root_disk_used_pct() {
    command -v df >/dev/null 2>&1 || return 1
    # `df -P /` prints header + one data row; field 5 of the data row
    # is "NN%", which awk strips down to the number.
    df -P / 2>/dev/null | awk 'NR==2 { gsub(/%/,"",$5); print $5 }'
}
# Print the maximum Use% across all filesystems as a bare integer, after
# refreshing the df capture files via collect_disk_df_to_file (defined
# elsewhere in this file — it is expected to fill DISK_DF_FILE and
# DISK_DF_RAW_FILE). Also caches the first 15 lines of the display file in
# DISK_LAST_DF_TEXT for later logging. Returns 1 when the raw df file is
# missing or no numeric Use% can be parsed from it.
get_disk_used_pct_max() {
collect_disk_df_to_file || true
# Keep the first 15 lines of the display file for log output.
if [[ -n "${DISK_DF_FILE:-}" && -f "${DISK_DF_FILE:-}" ]]; then
DISK_LAST_DF_TEXT="$(head -n 15 "$DISK_DF_FILE" 2>/dev/null || true)"
else
DISK_LAST_DF_TEXT=""
fi
# Parse Use% (column 5) from the raw `df -P` capture.
if [[ -z "${DISK_DF_RAW_FILE:-}" || ! -f "${DISK_DF_RAW_FILE:-}" ]]; then
return 1
fi
local max
max="$(awk '
# 跳过我们自己写的 3 行头(##/PATH/空行),再跳过 df 表头
/^## / { next }
/^PATH=/ { next }
NF==0 { next }
NR==1 { next }
{
u=$5
gsub(/%/,"",u)
if (u ~ /^[0-9]+$/) {
if (u > m) m=u
}
}
END{
if (m=="") exit 1
print m
}' "$DISK_DF_RAW_FILE" 2>/dev/null || true)"
[[ -n "$max" && "$max" =~ ^[0-9]+$ ]] || return 1
echo "$max"
return 0
}
# Take one disk-usage sample (root filesystem Use%) and fold it into the
# rolling DISK_* statistics; logs a WARN instead of INFO once usage reaches
# 90%. On acquisition/parse failure it logs an error and changes nothing.
monitor_disk_once() {
    local pct
    pct="$(get_root_disk_used_pct || true)"
    if [[ -z "$pct" || ! "$pct" =~ ^[0-9]+$ ]]; then
        log ERROR "[硬盘监测] $HOST_NAME => 获取根分区(/)使用率失败(df 不可用或解析失败)"
        return
    fi
    DISK_LAST_USED_PCT="$pct"
    DISK_SAMPLES=$(( DISK_SAMPLES + 1 ))
    DISK_SUM_USED_PCT=$(( DISK_SUM_USED_PCT + pct ))
    if (( pct > DISK_PEAK_USED_PCT )); then
        DISK_PEAK_USED_PCT="$pct"
        DISK_PEAK_TS="$(date +%s)"
    fi
    local mean=$(( DISK_SUM_USED_PCT / DISK_SAMPLES ))
    local peak_when="N/A"
    if [[ -n "${DISK_PEAK_TS}" && "${DISK_PEAK_TS}" != "0" ]]; then
        peak_when="$(date -d @"$DISK_PEAK_TS" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
    fi
    if (( pct >= 90 )); then
        log WARN "[硬盘监测] $HOST_NAME 根分区(/)使用率偏高:当前=${pct}% 平均=${mean}% 峰值=${DISK_PEAK_USED_PCT}%@${peak_when}"
    else
        log INFO "[硬盘监测] $HOST_NAME 根分区(/)当前使用率=${pct}% 平均=${mean}% 峰值=${DISK_PEAK_USED_PCT}%@${peak_when}"
    fi
}
# Convert a Markdown report file to HTML on stdout — "good enough" coverage:
# headings (#/##/###), unordered lists, tables, and plain paragraphs.
# $1 = path to the Markdown report file.
# This is a readability-first lightweight converter implemented as a single
# awk program; it deliberately does NOT aim for full Markdown support.
# NOTE: the awk script below is a shell string literal and is left exactly
# as written (including its inline comments); its getline-based table/list
# handling is order-sensitive.
md_to_html_simple() {
local report_file="$1"
# One awk pass handles tables, lists, headings and plain lines; see the
# inline awk comments for the per-construct rules.
awk '
function html_escape(s, t) {
t=s
gsub(/&/,"&amp;",t); gsub(/</,"&lt;",t); gsub(/>/,"&gt;",t)
return t
}
function trim(s){ sub(/^[ \t\r\n]+/,"",s); sub(/[ \t\r\n]+$/,"",s); return s }
BEGIN{
in_table=0
print "<html><head><meta charset=\"utf-8\"></head>"
print "<body style=\"font-family:Segoe UI,Microsoft YaHei,Arial,sans-serif;color:#111;\">"
print "<div style=\"max-width:980px;margin:0 auto;\">"
}
# 表格分隔行:| --- | --- |
function is_table_sep(line) {
return (line ~ /^\|[ \t]*[-: ]+[ \t]*\|/)
}
function is_table_row(line) {
return (line ~ /^\|.*\|[ \t]*$/)
}
function start_table(header_line) {
in_table=1
print "<table border=\"1\" cellpadding=\"6\" cellspacing=\"0\" style=\"border-collapse:collapse;margin:8px 0;font-size:13px;\">"
print "<thead><tr>"
split(header_line, a, "|")
for (i=2; i<=length(a)-1; i++){
h=trim(a[i])
print "<th style=\"background:#f3f4f6;\">" html_escape(h) "</th>"
}
print "</tr></thead><tbody>"
}
function table_row(line) {
print "<tr>"
split(line, a, "|")
for (i=2; i<=length(a)-1; i++){
c=trim(a[i])
print "<td>" html_escape(c) "</td>"
}
print "</tr>"
}
function end_table() {
if(in_table){
print "</tbody></table>"
in_table=0
}
}
{
line=$0
# 空行
if (line ~ /^[ \t]*$/) {
end_table()
print "<div style=\"height:8px;\"></div>"
next
}
# 表格处理:header + separator + body
if (!in_table && is_table_row(line)) {
header=line
getline sep
if (is_table_sep(sep)) {
start_table(header)
# 读取 body 行(可能 0..n 行)
while (getline body_line) {
if (!is_table_row(body_line)) {
# 非表格行,回退给后续处理:awk 没回退机制,只能先处理并继续
end_table()
line=body_line
break
}
table_row(body_line)
}
# 这里 line 可能是表格后第一行(非表格),继续走下面普通行逻辑
if (in_table==0 && line !~ /^\|.*\|[ \t]*$/) {
# fallthrough
} else {
next
}
} else {
# 不是标准表格,按普通行处理(sep 行也要输出)
end_table()
print "<div style=\"line-height:1.6;\">" html_escape(header) "</div>"
print "<div style=\"line-height:1.6;\">" html_escape(sep) "</div>"
next
}
}
end_table()
# 标题
if (substr(line,1,2)=="# ") {
print "<h1>" html_escape(substr(line,3)) "</h1>"
next
}
if (substr(line,1,3)=="## ") {
print "<h2>" html_escape(substr(line,4)) "</h2>"
next
}
if (substr(line,1,4)=="### ") {
print "<h3>" html_escape(substr(line,5)) "</h3>"
next
}
# 列表项(简单)
if (line ~ /^- /) {
# 开启列表块(用一个很简化的方式:遇到 - 就开始 UL,直到遇到非 - 行或空行)
print "<ul style=\"margin:6px 0 12px 18px;\">"
do {
item=substr(line,3)
print "<li style=\"line-height:1.6;\">" html_escape(item) "</li>"
if (getline nxt) {
if (nxt ~ /^- /) { line=nxt; continue }
else { line=nxt; break }
} else { line=""; break }
} while (1)
print "</ul>"
if (line=="") next
# 继续处理 line(当前是列表后第一行)
}
# 普通段落
print "<div style=\"line-height:1.6;white-space:pre-wrap;\">" html_escape(line) "</div>"
}
END{
end_table()
print "<hr/>"
print "<div style=\"color:#6b7280;font-size:12px;\">说明:本邮件正文为自动化监测报告的 HTML 展示版;原始 Markdown 报告请查看落盘文件。</div>"
print "</div></body></html>"
}' "$report_file"
}
# Print this host's primary IP address: prefer the source address of the
# default route (`ip route get 1`), fall back to the first address from
# `hostname -I`, and print "unknown-ip" when neither yields anything.
get_primary_ip() {
    local addr=""
    if command -v ip >/dev/null 2>&1; then
        addr="$(ip route get 1 2>/dev/null | awk '{for(i=1;i<=NF;i++){if($i=="src"){print $(i+1); exit}}}')"
    fi
    if [[ -z "$addr" ]] && command -v hostname >/dev/null 2>&1; then
        addr="$(hostname -I 2>/dev/null | awk '{print $1}')"
    fi
    printf '%s\n' "${addr:-unknown-ip}"
}
#################### 6. Mail delivery ####################
# Email the generated Markdown report as a plain-text body.
# $1 = path to the report (.md). Subject is prefix + primary IP + timestamp.
# Relies on the SMTP settings written to /etc/mail.rc. Returns 0 on success,
# 1 when the file is missing, no mail client exists, or sending fails.
send_report_mail() {
local report_md="$1"
if [[ ! -f "$report_md" ]]; then
log ERROR "[邮件发送] 报告文件不存在:$report_md"
return 1
fi
if ! command -v mailx >/dev/null 2>&1 && ! command -v mail >/dev/null 2>&1; then
log ERROR "[邮件发送] 未找到 mailx/mail,无法发送邮件"
return 1
fi
# Subject: prefix + server IP + current time.
local server_ip
server_ip="$(get_primary_ip)"
local subject="${MAIL_SUBJECT_PREFIX}${server_ip} - $(date '+%Y-%m-%d %H:%M:%S')"
# Recipients: commas -> spaces for broader mail client compatibility.
local to="${MAIL_TO}"
to="${to//,/ }"
log INFO "[邮件发送] 使用 mailx(SMTP) 发送纯文本报告给:$to"
# Plain-text send via mailx (SMTP config comes from /etc/mail.rc); some
# systems only ship `mail`, hence the either/or below.
# $to is intentionally unquoted so each address becomes its own argument.
if command -v mailx >/dev/null 2>&1; then
mailx -s "$subject" $to < "$report_md" || {
log ERROR "[邮件发送] mailx(SMTP) 发送失败"
return 1
}
else
mail -s "$subject" $to < "$report_md" || {
log ERROR "[邮件发送] mail(SMTP) 发送失败"
return 1
}
fi
return 0
}
# ==============================
# 3.1 - ERROR log monitoring (ubains-ERROR.log) tunables
# ==============================
# Each knob honors an already-exported environment variable and otherwise
# falls back to the default shown here.
ERROR_TAIL_LINES="${ERROR_TAIL_LINES:-5000}"                 # scan the last N lines of each log
ERROR_GROUP_GAP_SECONDS="${ERROR_GROUP_GAP_SECONDS:-60}"     # errors <=60s apart share one time range
ERROR_LOOKBACK_SECONDS="${ERROR_LOOKBACK_SECONDS:-3600}"     # reporting window: the last hour (per PRD)
# FIX: this variable used to be assigned twice (defaults 20 then 10); the
# second ":-10" default was dead code because the first assignment had
# already set the variable. Collapsed to one assignment keeping the
# previously effective default (20).
ERROR_PRINT_MAX_LINES_PER_RANGE="${ERROR_PRINT_MAX_LINES_PER_RANGE:-20}"  # max highlighted lines per range
ERROR_MAX_RANGES_IN_REPORT="${ERROR_MAX_RANGES_IN_REPORT:-5}"             # show at most the latest N ranges
# 根据平台类型,返回 meeting 服务目录(不含 logs)
get_meeting_service_dirs() {
local platform="$1" # new|legacy
local dirs=()
if [[ "$platform" == "new" ]]; then
dirs+=("/data/services/api/java-meeting/java-meeting2.0")
dirs+=("/data/services/api/java-meeting/java-meeting3.0")
else
dirs+=("/var/www/java/api-java-meeting2.0")
fi
printf "%s\n" "${dirs[@]}"
}
# Extract the first timestamp from a log line and print it as a Unix epoch.
# Accepts "YYYY-MM-DD HH:MM:SS" and "YYYY/MM/DD HH:MM:SS" (space or "T"
# separator, years 20xx). Prints nothing and returns non-zero when no
# parsable timestamp is found.
extract_epoch_from_log_line() {
    local stamp
    stamp="$(grep -Eo '20[0-9]{2}[-/][0-9]{2}[-/][0-9]{2}[ T][0-9]{2}:[0-9]{2}:[0-9]{2}' <<<"$1" | head -n 1)"
    [[ -n "$stamp" ]] || return 1
    # Normalize to "YYYY-MM-DD HH:MM:SS" so GNU date -d accepts it.
    stamp="${stamp//\//-}"
    stamp="${stamp/T/ }"
    date -d "$stamp" +%s 2>/dev/null
}
# Scan one ubains-ERROR.log and print a Markdown section to stdout that
# groups the last hour's entries into time ranges with sample lines.
# $1 = log file path, $2 = section title, $3 = number of tail lines to scan.
# Entries whose timestamps are within ERROR_LOOKBACK_SECONDS of now are
# clustered: consecutive entries <= ERROR_GROUP_GAP_SECONDS apart form one
# range; at most ERROR_PRINT_MAX_LINES_PER_RANGE sample lines are kept per
# range and only the latest ERROR_MAX_RANGES_IN_REPORT ranges are shown.
# Returns 0 even when the file is missing (section simply omitted).
scan_ubains_error_log_to_md() {
local error_log="$1"
local title="$2"
local tail_lines="$3"
[[ -f "$error_log" ]] || return 0
local now_ts from_ts
now_ts="$(date +%s)"
from_ts=$(( now_ts - ERROR_LOOKBACK_SECONDS ))
echo "### ERROR日志:${title}"
echo
echo "- 文件:\`${error_log}\`"
echo "- 扫描范围:最后 ${tail_lines} 行"
echo "- 统计窗口:最近 ${ERROR_LOOKBACK_SECONDS}s($(date -d @"$from_ts" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "$from_ts") ~ $(date -d @"$now_ts" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "$now_ts"))"
echo "- 聚合规则:相邻错误日志时间间隔 ≤ ${ERROR_GROUP_GAP_SECONDS}s 视为同一时间段"
echo "- 输出限制:最多展示最近 ${ERROR_MAX_RANGES_IN_REPORT} 个时间段;每段最多打印 ${ERROR_PRINT_MAX_LINES_PER_RANGE} 行"
echo
local tmp
tmp="$(mktemp)"
# Collect only records inside the lookback window as "epoch|line".
tail -n "$tail_lines" "$error_log" 2>/dev/null | while IFS= read -r line; do
local epoch
epoch="$(extract_epoch_from_log_line "$line" || true)"
[[ -z "$epoch" ]] && continue
(( epoch < from_ts )) && continue
(( epoch > now_ts )) && continue
printf "%s|%s\n" "$epoch" "$line"
done > "$tmp"
if [[ ! -s "$tmp" ]]; then
echo "- ✅ 最近 1 小时内未发现 ERROR 日志记录(或日志中未解析到时间戳)"
echo
rm -f "$tmp"
return 0
fi
sort -n "$tmp" -o "$tmp"
# Group into ranges; each flushed range is one "start|end|count|sample_file"
# record in ranges_meta, with its sample lines stored in a per-range file.
local range_start=0 range_end=0 range_count=0 last_ts=0
local range_idx=0
local ranges_meta
ranges_meta="$(mktemp)"
: > "$ranges_meta"
local range_lines_tmp
range_lines_tmp="$(mktemp)"
: > "$range_lines_tmp"
local printed=0
while IFS='|' read -r ts line; do
[[ -z "$ts" ]] && continue
if (( range_start == 0 )); then
# First record: open the first range.
range_start=$ts
range_end=$ts
last_ts=$ts
range_count=1
printed=0
: > "$range_lines_tmp"
echo "$line" >> "$range_lines_tmp"
printed=1
continue
fi
local gap=$(( ts - last_ts ))
if (( gap <= ERROR_GROUP_GAP_SECONDS )); then
# Same range: extend it, keep sampling up to the per-range cap.
range_end=$ts
last_ts=$ts
range_count=$((range_count + 1))
if (( printed < ERROR_PRINT_MAX_LINES_PER_RANGE )); then
echo "$line" >> "$range_lines_tmp"
printed=$((printed + 1))
fi
else
# Gap too large: flush the finished range (samples to their own
# file, metadata to ranges_meta), then start a new one.
range_idx=$((range_idx + 1))
local sample_file
sample_file="$(mktemp)"
cp "$range_lines_tmp" "$sample_file"
echo "${range_start}|${range_end}|${range_count}|${sample_file}" >> "$ranges_meta"
# reset
range_start=$ts
range_end=$ts
last_ts=$ts
range_count=1
printed=0
: > "$range_lines_tmp"
echo "$line" >> "$range_lines_tmp"
printed=1
fi
done < "$tmp"
# Flush the trailing range, if any.
if (( range_start > 0 )); then
range_idx=$((range_idx + 1))
local sample_file
sample_file="$(mktemp)"
cp "$range_lines_tmp" "$sample_file"
echo "${range_start}|${range_end}|${range_count}|${sample_file}" >> "$ranges_meta"
fi
# Total range count; only the latest N (by end time) are rendered.
local total_ranges
total_ranges="$(wc -l < "$ranges_meta" | tr -d ' ')"
echo "- 最近 1 小时内共发现 **${total_ranges}** 个 ERROR 时间段"
echo
# Render the latest N ranges (sorted by end timestamp).
local shown=0
while IFS='|' read -r rs re rc sf; do
local start_h end_h
start_h="$(date -d @"$rs" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "$rs")"
end_h="$(date -d @"$re" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "$re")"
shown=$((shown + 1))
echo "- **时间段${shown}**:**${start_h} ~ ${end_h}**(错误条数:${rc})"
echo
echo "````log"
cat "$sf"
echo "````"
echo
done < <(sort -t'|' -k2,2n "$ranges_meta" | tail -n "$ERROR_MAX_RANGES_IN_REPORT")
if (( total_ranges > ERROR_MAX_RANGES_IN_REPORT )); then
echo "> 仅展示最近 ${ERROR_MAX_RANGES_IN_REPORT} 个时间段,其余 $((total_ranges - ERROR_MAX_RANGES_IN_REPORT)) 个时间段已省略(仍计入统计)。"
echo
fi
# cleanup: per-range sample files, then the shared temp files.
while IFS='|' read -r _ _ _ sf; do
[[ -f "$sf" ]] && rm -f "$sf"
done < "$ranges_meta"
rm -f "$tmp" "$ranges_meta" "$range_lines_tmp"
}
# Emit "title|path" lines for meeting's internal (对内) ubains-ERROR.log
# files, one per service directory of the given platform layout ($1).
get_meeting_internal_error_logs() {
    local layout="$1"
    local svc_dir
    while IFS= read -r svc_dir; do
        [[ -n "$svc_dir" ]] || continue
        printf '%s|%s\n' \
            "meeting-对内-$(basename "$svc_dir")" \
            "${svc_dir}/logs/ubains-ERROR.log"
    done < <(get_meeting_service_dirs "$layout")
}
# Emit "title|path" lines for meeting's external (对外) logs. The PRD does
# not pin a dedicated external log path, so this reuses each service
# directory's ubains-ERROR.log (the report groups it under a 对外 title,
# which may overlap the 对内 list); nginx error/access logs are listed as
# commented-out candidates to enable per deployment.
get_meeting_external_error_logs() {
    local layout="$1"
    local svc_dir
    while IFS= read -r svc_dir; do
        [[ -n "$svc_dir" ]] || continue
        printf '%s|%s\n' \
            "meeting-对外-$(basename "$svc_dir")" \
            "${svc_dir}/logs/ubains-ERROR.log"
        # Common nginx candidates (scan them if present in your deployment):
        # printf '%s\n' "meeting-对外-nginx-error|/var/log/nginx/error.log"
        # printf '%s\n' "meeting-对外-nginx-access|/var/log/nginx/access.log"
    done < <(get_meeting_service_dirs "$layout")
}
# Append the "ERROR log monitoring" chapter to the Markdown report.
# $1 = report file path (appended to, never truncated).
# Skips everything when the meeting system (ujava) was not detected.
# Section 2.1 scans the internal log list, 2.2 the external list (with path
# de-duplication, since external entries may repeat internal paths); missing
# files are reported as such rather than treated as errors.
write_error_log_section_to_report() {
local out_file="$1"
{
echo "## 二、ERROR日志监测(最近1小时,按时间段聚合)"
echo
echo "> 说明:本章节独立于“日志暴涨”判定;用于统计最近 1 小时内 ERROR 日志出现的时间段,并重点打印(每段最多 ${ERROR_PRINT_MAX_LINES_PER_RANGE} 行)。"
echo
} >> "$out_file"
if [[ "${HAS_UJAVA:-0}" -ne 1 ]]; then
echo "- ℹ️ 未识别到会议预定系统(ujava),跳过 ERROR 日志扫描。" >> "$out_file"
echo >> "$out_file"
return 0
fi
# 2.1 Internal logs (meeting backend).
{
echo "### 2.1 对内日志(meeting 后端)"
echo
} >> "$out_file"
local any_internal=0
while IFS='|' read -r title log_path; do
[[ -z "$log_path" ]] && continue
if [[ -f "$log_path" ]]; then
any_internal=1
scan_ubains_error_log_to_md "$log_path" "$title" "$ERROR_TAIL_LINES" >> "$out_file"
else
{
echo "#### ${title}"
echo
echo "- 文件:\`${log_path}\`"
echo "- ⚠️ 未找到日志文件"
echo
} >> "$out_file"
fi
done < <(get_meeting_internal_error_logs "$PLATFORM_TYPE")
if [[ "$any_internal" -eq 0 ]]; then
echo "- ⚠️ 对内日志未发现可扫描的 ERROR 文件(全部缺失或不可读)。" >> "$out_file"
echo >> "$out_file"
fi
# 2.2 External logs (candidates: same directory or nginx).
{
echo "### 2.2 对外日志"
echo
echo
} >> "$out_file"
local any_external=0
# De-duplicate paths so shared log files are scanned only once.
local seen_tmp
seen_tmp="$(mktemp)"
: > "$seen_tmp"
while IFS='|' read -r title log_path; do
[[ -z "$log_path" ]] && continue
# Skip paths already scanned (de-dup key is the full path).
if grep -Fxq "$log_path" "$seen_tmp"; then
continue
fi
echo "$log_path" >> "$seen_tmp"
if [[ -f "$log_path" ]]; then
any_external=1
# nginx access.log may not contain "ERROR"; the scanner filters and
# groups by timestamp anyway.
scan_ubains_error_log_to_md "$log_path" "$title" "$ERROR_TAIL_LINES" >> "$out_file"
else
{
echo "#### ${title}"
echo
echo "- 文件:\`${log_path}\`"
echo "- (未找到,已跳过)"
echo
} >> "$out_file"
fi
done < <(get_meeting_external_error_logs "$PLATFORM_TYPE")
rm -f "$seen_tmp" 2>/dev/null || true
if [[ "$any_external" -eq 0 ]]; then
echo "- ⚠️ 对外日志未发现可扫描的文件(全部缺失或不可读)。" >> "$out_file"
echo >> "$out_file"
fi
}
#################### 5. 报告输出(md) ####################
#######################################
# write_md_report - assemble the Markdown monitoring report and print its path.
# Globals (read): REPORT_DIR, HOST_NAME, PLATFORM_TYPE, SYSTEMS,
#   HAS_UJAVA / HAS_UPYTHON / HAS_UPYTHON_VOICE,
#   MEM_SAMPLES / MEM_SUM_USED_MB / MEM_PEAK_TS / MEM_PEAK_USED_MB / MEM_LAST_USED_MB,
#   MYSQL_SAMPLES / MYSQL_SUM_CONN / MYSQL_PEAK_TS / MYSQL_PEAK_CONN / MYSQL_LAST_CONN,
#   MYSQL_LAST_BURST_STATUS / MYSQL_LAST_BURST_DESC,
#   DISK_SAMPLES / DISK_SUM_USED_PCT / DISK_PEAK_TS / DISK_PEAK_USED_PCT / DISK_LAST_USED_PCT,
#   LOG_TARGET_SYS / LOG_TARGET_PATH (parallel arrays),
#   BURST_LAST_RESULT / BURST_LAST_DESC (assoc arrays keyed via make_log_key),
#   CONTAINER_RUNNING_LIST / CONTAINER_EXITED_LIST (tab-separated rows
#   — presumably "id<TAB>name<TAB>image<TAB>status<TAB>created"; see read loop).
# Outputs: writes monitor_report_<host>_<ts>.md under REPORT_DIR, then echoes
#   the report file path on stdout (callers capture it as "$(write_md_report)").
#######################################
write_md_report() {
# The report directory may already exist; ignore mkdir failures.
mkdir -p "$REPORT_DIR" 2>/dev/null || true
local ts file_ts report_file
ts="$(date '+%Y-%m-%d %H:%M:%S')"
file_ts="$(date '+%Y%m%d_%H%M%S')"
report_file="${REPORT_DIR}/monitor_report_${HOST_NAME}_${file_ts}.md"
local platform_text
if [[ "$PLATFORM_TYPE" == "new" ]]; then
platform_text="新统一平台 (/data/services)"
else
platform_text="传统平台 (/var/www)"
fi
local systems_text="无"
((${#SYSTEMS[@]} > 0)) && systems_text="${SYSTEMS[*]}"
# Memory stats: averages/peaks stay "N/A" when no samples were collected.
local mem_avg_used="N/A"
(( MEM_SAMPLES > 0 )) && mem_avg_used=$(( MEM_SUM_USED_MB / MEM_SAMPLES ))
local mem_peak_time="N/A"
if [[ "$MEM_PEAK_TS" != "0" && -n "$MEM_PEAK_TS" ]]; then
# 'date -d @epoch' may be unsupported on some systems; fall back to "now".
mem_peak_time="$(date -d @"$MEM_PEAK_TS" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
fi
# MySQL connection stats, same N/A convention as above.
local mysql_avg_conn="N/A"
(( MYSQL_SAMPLES > 0 )) && mysql_avg_conn=$(( MYSQL_SUM_CONN / MYSQL_SAMPLES ))
local mysql_peak_time="N/A"
if [[ -n "$MYSQL_PEAK_TS" && "$MYSQL_PEAK_TS" != "0" ]]; then
mysql_peak_time="$(date -d @"$MYSQL_PEAK_TS" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
fi
# Disk statistics (root partition usage only).
local disk_avg_used="N/A"
(( DISK_SAMPLES > 0 )) && disk_avg_used=$(( DISK_SUM_USED_PCT / DISK_SAMPLES ))
local disk_peak_time="N/A"
if [[ -n "${DISK_PEAK_TS:-}" && "${DISK_PEAK_TS:-0}" != "0" ]]; then
disk_peak_time="$(date -d @"$DISK_PEAK_TS" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
fi
# 1) First write the report header plus section one: the log-burst overview
#    (ERROR details are deliberately excluded here; see step 2).
{
echo "# 自动化服务监测报告"
echo
echo "- 生成时间:${ts}"
echo "- 主机名:${HOST_NAME}"
echo "- 平台类型:${platform_text}"
echo "- 系统识别:ujava=${HAS_UJAVA},upython=${HAS_UPYTHON},upython_voice=${HAS_UPYTHON_VOICE}"
echo "- 系统列表:${systems_text}"
echo
echo "## 一、日志审计概览(仅暴涨情况,不含 ERROR 详情)"
echo
if ((${#LOG_TARGET_SYS[@]} == 0)); then
echo "- 当前未匹配到需要审计的日志目标(例如未识别到会议预定系统 ujava 容器)。"
else
for ((i=0; i<${#LOG_TARGET_SYS[@]}; i++)); do
local sys_name="${LOG_TARGET_SYS[$i]}"
local log_path="${LOG_TARGET_PATH[$i]}"
local key
key="$(make_log_key "$sys_name" "$log_path")"
# '${arr[key]-default}' keeps "未采集"/"无" when the key was never sampled.
local burst_status="${BURST_LAST_RESULT[$key]-未采集}"
local burst_desc="${BURST_LAST_DESC[$key]-无}"
echo "### 日志:${sys_name}"
echo
echo "- 日志路径:\`${log_path}\`"
echo "- 日志暴涨状态:**${burst_status}**"
echo "- 日志暴涨详情:${burst_desc}"
echo
done
fi
echo
} > "$report_file"
# 2) Append: ERROR log monitoring section (independent of burst detection).
write_error_log_section_to_report "$report_file"
# 3) Append the remaining sections (numbering continues from the above).
{
echo "## 三、内存资源消耗"
echo
echo "- 当前内存使用:${MEM_LAST_USED_MB} MB"
echo "- 内存使用平均值:${mem_avg_used} MB"
echo "- 内存使用峰值:${MEM_PEAK_USED_MB} MB"
echo "- 峰值发生时间:${mem_peak_time}"
echo
echo "## 四、MySQL 连接数监测"
echo
echo "- 当前 MySQL 连接数:${MYSQL_LAST_CONN}"
echo "- MySQL 连接平均值:${mysql_avg_conn}"
echo "- MySQL 连接峰值:${MYSQL_PEAK_CONN}"
echo "- 峰值发生时间:${mysql_peak_time}"
echo "- 最近暴涨判定状态:**${MYSQL_LAST_BURST_STATUS}**"
echo "- 暴涨详情:${MYSQL_LAST_BURST_DESC}"
echo
echo "## 五、硬盘空间检测"
echo
echo "- 当前硬盘使用率(根分区 / ):${DISK_LAST_USED_PCT}%"
echo "- 硬盘使用率平均值:${disk_avg_used}%"
echo "- 硬盘使用率峰值:${DISK_PEAK_USED_PCT}%"
echo "- 峰值发生时间:${disk_peak_time}"
echo
echo "> 说明:为保证稳定性,当前仅采集根分区(/)的使用率,不再采集/输出 df 详细列表。"
echo
echo "## 六、容器信息检测"
echo
echo "### 6.1 运行中的容器"
echo
if [[ -z "${CONTAINER_RUNNING_LIST//[[:space:]]/}" ]]; then
echo "- 未采集到运行中的容器信息。"
else
echo "| 容器ID | 名称 | 镜像 | 状态 | 创建时间 |"
echo "| ------ | ---- | ---- | ---- | -------- |"
# Rows are TAB-separated; the while stage runs in a subshell, which is
# fine here because it only echoes into the enclosing redirection block.
echo "$CONTAINER_RUNNING_LIST" | while IFS=$'\t' read -r cid cname cimg cstatus ctime; do
[[ -z "${cid}${cname}${cimg}${cstatus}${ctime}" ]] && continue
echo "| ${cid} | ${cname} | ${cimg} | ${cstatus} | ${ctime} |"
done
fi
echo
echo "### 6.2 未运行的容器(Exited/其它非 Up 状态)"
echo
if [[ -z "${CONTAINER_EXITED_LIST//[[:space:]]/}" ]]; then
echo "- 未采集到未运行的容器信息。"
else
echo "| 容器ID | 名称 | 镜像 | 状态 | 创建时间 |"
echo "| ------ | ---- | ---- | ---- | -------- |"
echo "$CONTAINER_EXITED_LIST" | while IFS=$'\t' read -r cid cname cimg cstatus ctime; do
[[ -z "${cid}${cname}${cimg}${cstatus}${ctime}" ]] && continue
echo "| ${cid} | ${cname} | ${cimg} | ${cstatus} | ${ctime} |"
done
fi
echo
echo "> 说明:本报告由 \`AutomatedServiceMonitoring.sh\` 自动生成,仅反映本次执行时刻的监测结果。"
echo
} >> "$report_file"
# Print the report path for the caller (the function's only stdout output).
echo "$report_file"
}
#################### Main flow: single run ####################
#######################################
# main_run_once - perform one complete monitoring pass on the local host:
#   0. ensure mail tooling (mailx / SMTP config / sendmail) is ready
#   1. detect platform type and deployed systems
#   2. run log-burst checks on every resolved log target
#   3. run memory / MySQL / container / disk collectors
#   4. write the Markdown report and send it by mail
# Relies on globals populated by the helpers it calls (PLATFORM_TYPE,
# BASE_PATH, SYSTEMS, LOG_TARGET_SYS / LOG_TARGET_PATH, HAS_* flags).
# Returns: 0; individual step failures are logged, not fatal.
#######################################
main_run_once() {
  log INFO "[启动] 自动化服务监测 Shell 脚本(单次执行模式),目标服务器(本机)=${HOST_NAME}"
  # 0. Environment preparation: mail dependencies and configuration only.
  ensure_mailx_installed || log ERROR "[启动] mailx 安装失败,后续邮件发送可能不可用。"
  ensure_mailx_smtp_config || log ERROR "[启动] mailx SMTP 配置生成失败,后续邮件发送可能不可用。"
  # Ensure sendmail is available (used for HTML mail); otherwise fall back to plain text.
  ensure_sendmail_installed || log WARN "[启动] sendmail 安装失败,将回退纯文本邮件(排版不可优化)。"
  detect_platform
  detect_systems
  local platform_text
  if [[ "$PLATFORM_TYPE" == "new" ]]; then
    platform_text="新统一平台(/data/services)"
  else
    platform_text="传统平台(/var/www)"
  fi
  log INFO "[平台识别] $HOST_NAME => 平台类型=${platform_text} 基路径=${BASE_PATH}"
  log INFO "[系统识别] $HOST_NAME => ujava=${HAS_UJAVA} upython=${HAS_UPYTHON} upython_voice=${HAS_UPYTHON_VOICE} 系统=(${SYSTEMS[*]:-})"
  # FIX: the heartbeat line previously used
  #   平台=$([[ "$PLATFORM_TYPE" == "new" ]] && echo || echo )
  # which always expands to an empty string (both branches echo nothing),
  # leaving the platform field blank. Reuse platform_text computed above.
  log INFO "[心跳] $HOST_NAME: 平台=${platform_text} 基路径=${BASE_PATH} 系统=${SYSTEMS[*]:-}"
  resolve_log_targets
  local i sys_name log_path
  for ((i=0; i<${#LOG_TARGET_SYS[@]}; i++)); do
    sys_name="${LOG_TARGET_SYS[$i]}"
    log_path="${LOG_TARGET_PATH[$i]}"
    monitor_log_burst_once "$sys_name" "$log_path"
  done
  monitor_mem_once
  monitor_mysql_once
  collect_container_info
  # Disk space check (section 3.6 of the report).
  monitor_disk_once
  # Generate the report; write_md_report prints the file path on stdout.
  local report_file
  report_file="$(write_md_report)"
  # 6. Send mail (pass report_file, NOT the undefined REPORT_MD_FILE).
  send_report_mail "$report_file" || {
    log ERROR "[主流程] 报告已生成,但邮件发送失败,请检查日志和邮件客户端配置。"
  }
  # 7. DingTalk text-summary notification — currently disabled; kept for reference.
  # local now_time
  # now_time="$(date '+%Y-%m-%d %H:%M:%S')"
  # # 汇总整体状态:有暴涨/错误就判为异常
  # local overall_status="正常"
  # local overall_emoji="✅"
  # # 日志暴涨是否异常
  # local any_log_bad=0
  # for key in "${!BURST_LAST_RESULT[@]}"; do
  #   case "${BURST_LAST_RESULT[$key]}" in
  #     BURST|ERROR)
  #       any_log_bad=1
  #       break
  #       ;;
  #   esac
  # done
  # # MySQL 是否暴涨/错误
  # local mysql_bad=0
  # case "$MYSQL_LAST_BURST_STATUS" in
  #   BURST|ERROR)
  #     mysql_bad=1
  #     ;;
  # esac
  # if (( any_log_bad == 1 || mysql_bad == 1 )); then
  #   overall_status="异常"
  #   overall_emoji="❌"
  # fi
  # local dd_text
  # dd_text="【监控】${HOST_NAME} - ${overall_status}${overall_emoji}
  # 触发时间:${now_time}
  # 监控概览:
  # - 日志审计:${BURST_LAST_DESC[*]:-未采集}
  # - 内存:当前 ${MEM_LAST_USED_MB} MB,峰值 ${MEM_PEAK_USED_MB} MB
  # - MySQL:当前连接数 ${MYSQL_LAST_CONN},峰值 ${MYSQL_PEAK_CONN},暴涨状态:${MYSQL_LAST_BURST_STATUS}
  # - 容器:运行中 $(echo \"$CONTAINER_RUNNING_LIST\" | wc -l) 个,未运行 $(echo \"$CONTAINER_EXITED_LIST\" | wc -l) 个
  # 系统监控告警通知"
  # DD_TEXT="$dd_text" send_dingtalk_markdown || {
  #   log ERROR "[主流程] 报告已生成,但钉钉发送失败,请检查 DINGDING_ACCESS_TOKEN / SECRET 配置。"
  # }
  log INFO "[结束] 本次监测已完成,报告已生成并尝试发送邮件与钉钉通知。"
}
#################### Script entry point ####################
main_run_once
\ No newline at end of file
# 服务器监测脚本模块化拆分需求文档
**文档版本**: v1.0
**创建日期**: 2026-05-09
**当前脚本**: `AuxiliaryTool/ScriptTool/服务器监测/check_server_health.ps1`
**文档状态**: 待评审
---
## 一、项目背景
### 1.1 当前现状
**当前脚本问题**:
1. **单一文件过大** - 当前主脚本包含约4300+行代码,维护困难
2. **功能耦合严重** - 所有检测逻辑混在一个文件中,难以独立修改和测试
3. **扩展性差** - 新增功能需要修改主脚本,容易引入bug
4. **代码复用率低** - 相似功能重复实现,缺乏通用函数库
5. **PowerShell解析复杂** - 复杂的Linux命令字符串在PowerShell中解析困难
**脚本规模统计**:
- 总行数: 4300+
- 函数数量: 50+
- 检测项数量: 180+
- 检测模块: 9个主要模块
### 1.2 模块化目标
**主要目标**:
1. **提高可维护性** - 将检测逻辑拆分为独立的Shell脚本模块
2. **增强可扩展性** - 新增功能只需添加新的检测模块,无需修改主脚本
3. **提升可测试性** - 每个模块可独立测试,便于验证功能正确性
4. **降低复杂度** - PowerShell主脚本专注于协调和结果处理,Shell脚本专注数据采集
5. **统一接口规范** - 所有模块使用统一的输出格式,便于解析和处理
**非目标**:
- 不改变检测功能本身(功能保持与原脚本一致)
- 不改变报告格式(输出格式保持不变)
- 不改变用户交互方式(仍然使用PowerShell交互)
---
## 二、模块化设计方案
### 2.1 设计原则
#### 2.1.1 职责分离原则
**PowerShell主脚本职责**:
1. 用户交互和输入验证
2. SSH连接管理
3. 上传检测模块到远程服务器
4. 调度检测模块执行(串行/并行)
5. 解析模块返回结果
6. 阈值判断和问题分类
7. 生成Markdown报告
**Shell检测模块职责**:
1. 执行具体的检测命令
2. 采集原始数据
3. 简单的数据处理和计算
4. 按统一格式输出结果
#### 2.1.2 接口规范原则
**统一输出格式**:
```
KEY1:VALUE1
KEY2:VALUE2
KEY3:VALUE3
...
ERROR:错误信息(如有)
```
**命名规范**:
- 模块脚本: `功能简称_模块简称.sh` (如: `cpu_check.sh`)
- 输出键名: 大写字母+下划线 (如: `CPU_USAGE`, `MEMORY_TOTAL`)
- 错误输出: `ERROR:错误描述`
#### 2.1.3 依赖管理原则
**模块依赖关系**:
```
config.sh (配置)
common.sh (通用函数)
各检测模块 (独立调用)
```
**依赖原则**:
- 检测模块之间不相互依赖
- 所有模块只依赖 `config.sh` 和 `common.sh`
- 通过主脚本进行模块编排,而非模块间相互调用
### 2.2 目录结构设计
#### 2.2.1 完整目录结构
```
AuxiliaryTool/ScriptTool/服务器监测/
├── check_server_health.ps1 # 主PowerShell脚本(协调器)
├── lib/ # 检测模块库目录
│ ├── config.sh # 配置文件(密码、容器名、阈值等)
│ ├── common.sh # 通用函数库(日志、工具函数等)
│ │
│ ├── system/ # 系统基础检测模块
│ │ ├── 01_system_basic.sh # 系统基础信息
│ │ ├── 02_cpu_check.sh # CPU资源检测
│ │ ├── 03_memory_check.sh # 内存资源检测
│ │ ├── 04_disk_check.sh # 磁盘资源检测
│ │ ├── 05_oom_check.sh # OOM和内核异常检测
│ │ ├── 06_process_check.sh # 进程状态检测
│ │ ├── 07_network_check.sh # 网络连接检测
│ │ ├── 08_security_check.sh # 安全合规检测
│ │ ├── 09_system_logs.sh # 系统日志检测
│ │ ├── 10_time_sync.sh # 时间同步检测
│ │ ├── 11_scheduled_tasks.sh # 定时任务检测
│ │ └── 12_port_check.sh # 端口服务检测
│ │
│ ├── service/ # 服务层检测模块
│ │ ├── 20_docker_basic.sh # Docker基础检测
│ │ ├── 21_docker_deep.sh # Docker深度检测
│ │ ├── 22_mysql_basic.sh # MySQL基础检测
│ │ ├── 23_mysql_depth.sh # MySQL深度检测(已存在,需适配)
│ │ ├── 24_redis_basic.sh # Redis基础检测
│ │ ├── 25_redis_depth.sh # Redis深度检测(已存在,需适配)
│ │ ├── 26_emqx_basic.sh # EMQX基础检测
│ │ ├── 27_emqx_deep.sh # EMQX深度检测
│ │ ├── 28_java_check.sh # Java应用检测
│ │ ├── 29_python_check.sh # Python应用检测
│ │ ├── 30_nginx_check.sh # Nginx应用检测
│ │ ├── 31_nacos_check.sh # Nacos应用检测
│ │ ├── 32_fastdfs_check.sh # FastDFS检测
│ │ └── 33_app_logs.sh # 应用日志分析
│ │
│ └── utils/ # 工具脚本
│ ├── upload.sh # 文件上传工具
│ ├── execute.sh # 批量执行工具
│ └── collect.sh # 结果收集工具
├── bin/ # 可执行工具
│ ├── plink.exe # SSH连接工具
│ └── pscp.exe # 文件传输工具
└── reports/ # 报告输出目录
```
#### 2.2.2 模块编号规则
**编号规则**: `分类编号_序号`
| 分类 | 编号范围 | 说明 |
|:---|:---|:---|
| 配置和通用 | 00-09 | 基础设施 |
| 系统检测 | 10-19 | 系统层面检测 |
| 服务检测 | 20-39 | 服务和应用检测 |
**优势**:
- 编号反映模块类别
- 便于按需加载(如:只检测系统、只检测服务)
- 方便问题定位(知道哪个模块出问题)
### 2.3 Shell脚本规范
#### 2.3.1 脚本头部规范
```bash
#!/bin/bash
################################################################################
# 模块名称: CPU资源检测
# 功能描述: 检测CPU使用率、核心数、负载、中断等信息
# 输出格式: KEY:VALUE(每行一个键值对)
# 作者: 系统自动生成
# 创建日期: 2026-05-09
# 依赖: config.sh, common.sh
# 使用方法: ./02_cpu_check.sh
################################################################################
# 获取脚本目录
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LIB_DIR="$(dirname "$SCRIPT_DIR")/lib"
# 加载依赖
source "$LIB_DIR/config.sh"
source "$LIB_DIR/common.sh"
```
#### 2.3.2 输出格式规范
**标准输出格式**:
```bash
# 成功输出示例
CPU_CORES:8
CPU_USAGE:45.6
CPU_1MIN:2.35
CPU_5MIN:2.12
CPU_15MIN:1.98
PROCESS_TOP5:java 15.3, mysql 8.2, nginx 3.1
INTERRUPT_TOTAL:12345678
SOFTIRQ_TOTAL:2345678
# 错误输出示例
ERROR:无法获取CPU使用率
```
**输出规范**:
1. 每行一个键值对,格式为 `KEY:VALUE`
2. KEY使用大写字母和下划线
3. VALUE不包含换行符
4. 数组类型使用逗号分隔或管道符
5. 错误信息以 `ERROR:` 开头
6. 不输出额外信息(如调试信息、进度信息等)
#### 2.3.3 错误处理规范
```bash
# 错误处理模板
check_command() {
if ! command -v "$1" >/dev/null 2>&1; then
echo "ERROR:命令 $1 不存在"
exit 1
fi
}
# 使用示例
check_command "mpstat"
```
#### 2.3.4 阈值输出规范
```bash
# 阈值信息输出(由Shell脚本判断)
THRESHOLD_ITEM:CPU_USAGE
THRESHOLD_WARNING:85
THRESHOLD_CRITICAL:100
THRESHOLD_CURRENT:45.6
THRESHOLD_STATUS:NORMAL
# 或者不输出阈值信息,由主脚本统一判断
```
### 2.4 PowerShell主脚本设计
#### 2.4.1 主脚本结构
```powershell
################################################################################
# 服务器健康监测脚本 v3.0(模块化版本)
# 功能: 通过SSH连接远程服务器,调用检测模块并生成报告
# 架构: PowerShell协调器 + Shell检测模块
################################################################################
param(
[string]$HostName = "",
[int]$Port = 0,
[string]$Username = "",
[string]$Password = ""
)
# ==================== 全局变量 ====================
$script:检测结果 = @{}
$script:严重问题 = @()
$script:警告问题 = @()
$script:检测时间 = Get-Date
# ==================== 模块配置 ====================
$ModuleConfig = @{
SystemModules = @(
"01_system_basic.sh",
"02_cpu_check.sh",
"03_memory_check.sh",
# ... 更多系统模块
)
ServiceModules = @(
"20_docker_basic.sh",
"22_mysql_basic.sh",
"24_redis_basic.sh",
# ... 更多服务模块
)
ModulePath = "/tmp/check_modules"
}
# ==================== 核心函数 ====================
# 上传模块到远程服务器
function Publish-Modules {
param([string]$LibPath)
Write-Log "上传检测模块到远程服务器..."
# 创建远程目录
Invoke-SSHCommand "mkdir -p $ModulePath/system $ModulePath/service"
# 上传配置和通用函数
Invoke-SSHUpload "$LibPath/config.sh" "$ModulePath/config.sh"
Invoke-SSHUpload "$LibPath/common.sh" "$ModulePath/common.sh"
# 上传检测模块
foreach ($module in $ModuleConfig.SystemModules) {
Invoke-SSHUpload "$LibPath/system/$module" "$ModulePath/system/$module"
}
foreach ($module in $ModuleConfig.ServiceModules) {
Invoke-SSHUpload "$LibPath/service/$module" "$ModulePath/service/$module"
}
}
# 执行检测模块
function Invoke-ModuleCheck {
param(
[string]$ModuleName,
[string]$Category
)
Write-Log "执行模块: $ModuleName"
$remoteScript = "$ModulePath/$Category/$ModuleName"
$result = Invoke-SSHCommand "chmod +x $remoteScript && $remoteScript"
# 解析结果
Parse-ModuleResult $result $ModuleName
}
# 解析模块结果
function Parse-ModuleResult {
param(
[string]$RawOutput,
[string]$ModuleName
)
$results = @()
$lines = $RawOutput -split "`n"
foreach ($line in $lines) {
if ($line -match "^ERROR:(.+)") {
Write-Log "$ModuleName 错误: $($matches[1])" "WARN"
continue
}
if ($line -match "^([^:]+):(.+)$") {
$key = $matches[1].Trim()
$value = $matches[2].Trim()
# 根据KEY生成检测结果对象
$result = Convert-ToResultObject $key $value $ModuleName
if ($result) {
$results += $result
}
}
}
return $results
}
# 主函数
function Main {
# 1. 交互式输入
Invoke-InteractiveInput
# 2. SSH连接测试
Test-SSHConnection
# 3. 上传检测模块
Publish-Modules -LibPath "$PSScriptRoot/lib"
# 4. 执行系统检测
foreach ($module in $ModuleConfig.SystemModules) {
$results = Invoke-ModuleCheck -ModuleName $module -Category "system"
Save-TestResult "系统检测" $results
}
# 5. 执行服务检测
foreach ($module in $ModuleConfig.ServiceModules) {
$results = Invoke-ModuleCheck -ModuleName $module -Category "service"
Save-TestResult "服务检测" $results
}
# 6. 生成报告
$report = New-MarkdownReport
Save-Report $report
Write-Host "检测完成!报告已生成" -ForegroundColor Green
}
```
#### 2.4.2 配置文件设计
**lib/config.sh**:
```bash
#!/bin/bash
################################################################################
# 配置文件
# 说明: 集中管理所有配置信息,避免在脚本中硬编码
################################################################################
# ==================== 容器配置 ====================
declare -A CONTAINERS
CONTAINERS[mysql]="umysql"
CONTAINERS[redis]="uredis"
CONTAINERS[emqx]="uemqx"
CONTAINERS[java]="ujava2"
CONTAINERS[nginx]="unginx"
CONTAINERS[nacos]="unacos"
# ==================== 密码配置 ====================
MYSQL_PASSWORD="dNrprU&2S"
REDIS_PASSWORD="dNrprU&2S"
# ==================== 阈值配置 ====================
# CPU阈值
CPU_WARNING=85
CPU_CRITICAL=100
# 内存阈值
MEMORY_WARNING=85
MEMORY_CRITICAL=95
# 磁盘阈值
DISK_WARNING=90
DISK_CRITICAL=95
# ==================== 路径配置 ====================
# 日志路径
JAVA_LOG_PATH="/data/services/api/*/log"
PYTHON_LOG_PATH="/data/services/api/python*/log"
NGINX_LOG_PATH="/data/middleware/nginx/log"
NACOS_LOG_PATH="/data/middleware/nacos/logs"
# ==================== 函数 ====================
# 获取配置值
get_config() {
local key=$1
echo "${!key}"
}
# 检查容器是否存在
check_container() {
local container=$1
docker ps --format '{{.Names}}' | grep -q "^${container}$"
return $?
}
```
#### 2.4.3 通用函数库设计
**lib/common.sh**:
```bash
#!/bin/bash
################################################################################
# 通用函数库
# 说明: 提供通用的检测函数,供各检测模块调用
################################################################################
# ==================== 日志函数 ====================
log_info() { echo "[INFO] $*"; }
log_error() { echo "[ERROR] $*"; }
log_warn() { echo "[WARN] $*"; }
# ==================== Docker通用函数 ====================
# 检查容器是否运行
is_container_running() {
local container=$1
docker ps --format '{{.Names}}' | grep -q "^${container}$"
return $?
}
# 获取容器IP
get_container_ip() {
local container=$1
docker inspect --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$container" 2>/dev/null
}
# 在容器内执行命令
exec_in_container() {
local container=$1
shift
docker exec "$container" "$@" 2>/dev/null
}
# ==================== 系统信息函数 ====================
# 获取系统运行时间(秒)
get_uptime() {
cat /proc/uptime | awk '{print int($1)}'
}
# 获取系统负载
get_loadavg() {
cat /proc/loadavg | awk '{print $1,$2,$3}'
}
# 检查命令是否存在
require_command() {
local cmd=$1
if ! command -v "$cmd" >/dev/null 2>&1; then
echo "ERROR:需要安装 $cmd 命令"
return 1
fi
return 0
}
# ==================== 数据处理函数 ====================
# 格式化字节大小
format_bytes() {
local bytes=$1
if [ "$bytes" -lt 1024 ]; then
echo "${bytes}B"
elif [ "$bytes" -lt 1048576 ]; then
echo "$((bytes / 1024))KB"
elif [ "$bytes" -lt 1073741824 ]; then
echo "$((bytes / 1048576))MB"
else
echo "$((bytes / 1073741824))GB"
fi
}
# 格式化百分比
format_percent() {
local numerator=$1
local denominator=$2
if [ "$denominator" -eq 0 ]; then
echo "0"
else
echo "$((numerator * 100 / denominator))"
fi
}
# ==================== 检测函数 ====================
# 检查服务端口
check_port() {
local port=$1
local timeout=${2:-3}
timeout "$timeout" bash -c "cat < /dev/tcp/127.0.0.1/$port" 2>/dev/null
return $?
}
# 检查进程是否存在
check_process() {
local process_name=$1
pgrep -x "$process_name" >/dev/null
return $?
}
# 检查文件是否存在并可读
check_file() {
local file=$1
[ -r "$file" ]
return $?
}
```
---
## 三、功能模块映射
### 3.1 系统基础检测模块
| 模块编号 | 脚本名称 | 对应原函数 | 输出键名示例 |
|:---|:---|:---|:---|
| 01 | system_basic.sh | Test-SystemBasicInfo | HOSTNAME, OS_VERSION, KERNEL_VERSION, UPTIME_DAYS, LOAD_1MIN, LOAD_5MIN, LOAD_15MIN |
| 02 | cpu_check.sh | Test-CPUResource | CPU_CORES, CPU_USAGE, CPU_USER, CPU_SYSTEM, PROCESS_TOP5, INTERRUPT_TOTAL, SOFTIRQ_TOTAL |
| 03 | memory_check.sh | Test-MemoryResource | MEMORY_TOTAL, MEMORY_USED, MEMORY_FREE, MEMORY_USAGE_PERCENT, SWAP_TOTAL, SWAP_USED, PRESSURE_STATUS |
| 04 | disk_check.sh | Test-DiskResource | DISK_MOUNT, DISK_TOTAL, DISK_USED, DISK_PERCENT, DISK_INODE_PERCENT, DISK_IO_BUSY |
| 05 | oom_check.sh | Test-OOMAndKernel | OOM_COUNT, PANIC_COUNT, CORE_DUMP_COUNT, FS_READONLY |
| 06 | process_check.sh | Test-ProcessStatus | ZOMBIE_COUNT, THREAD_TOTAL, THREAD_TOP10, FD_USAGE_PERCENT, PROCESS_FD_TOP10, PROCESS_CONN_TOP10 |
| 07 | network_check.sh | Test-NetworkStatus | TCP_ESTABLISHED, TCP_TIME_WAIT, TCP_CLOSE_WAIT, NET_INTERFACE_COUNT, ARP_COUNT, ROUTE_COUNT, DNS_STATUS |
| 08 | security_check.sh | Test-SecurityStatus | SELINUX_STATUS, FIREWALL_STATUS, AUTH_FAILED_24H, SUSPICIOUS_SUID, LAST_LOGIN_COUNT, SSH_CONFIG_STATUS |
| 09 | system_logs.sh | Test-SystemLogs | KERNEL_ERROR_24H, AUTH_FAILED_COUNT, DISK_ERROR_24H, SYSTEMD_FAILED_COUNT, PANIC_COUNT, CRASH_COUNT, SSH_ATTACK_COUNT |
| 10 | time_sync.sh | Test-TimeSync | NTP_STATUS, CHRONY_SOURCE, TIME_OFFSET, SSL_CERT_DAYS, EMQX_CERT_DAYS |
| 11 | scheduled_tasks.sh | Test-ScheduledTasks | CRON_COUNT, SYSTEMD_TIMER_COUNT |
| 12 | port_check.sh | Test-PortService | PORT_22, PORT_443, PORT_8306, PORT_6379, PORT_1883, PORT_8080, PORT_8999 |
### 3.2 服务层检测模块
| 模块编号 | 脚本名称 | 对应原函数 | 输出键名示例 |
|:---|:---|:---|:---|
| 20 | docker_basic.sh | Test-DockerStatus | CONTAINER_RUNNING_COUNT, CONTAINER_TOTAL_COUNT, CONTAINER_RESTART_LIST, DOCKER_DISK_USAGE |
| 21 | docker_deep.sh | (新增) | CONTAINER_HEALTH_LIST, CONTAINER_LIMIT_LIST, CONTAINER_LOG_SIZE_LIST, DOCKER_EVENT_COUNT |
| 22 | mysql_basic.sh | Test-MySQLStatus | MYSQL_VERSION, MYSQL_UPTIME, CONNECTIONS_CURRENT, CONNECTIONS_MAX, CONNECTIONS_PERCENT, SLOW_QUERIES, QPS, TPS, DEADLOCKS |
| 23 | mysql_depth.sh | Test-MySQLStatus (深度) | INNODB_STATUS, BUFFER_POOL_HIT_RATE, TABLE_FRAG_COUNT, ACTIVE_QUERIES, BINLOG_STATUS, REPLICATION_STATUS, DATABASE_LIST, DATABASE_SIZE |
| 24 | redis_basic.sh | Test-RedisStatus | REDIS_VERSION, REDIS_UPTIME_DAYS, CONNECTED_CLIENTS, USED_MEMORY, MAXMEMORY, KEY_COUNT, MEMORY_FRAG_RATE |
| 25 | redis_depth.sh | Test-RedisStatus (深度) | KEYSPACE_DETAIL, PERSISTENCE_STATUS, REPLICATION_STATUS, CLUSTER_STATUS, SLOW_LOG_TOP10, COMMAND_STATS_TOP10, CLIENT_DETAIL, CACHE_HIT_RATE, CONFIG_CHECK |
| 26 | emqx_basic.sh | Test-EMQXStatus | EMQX_VERSION, EMQX_UPTIME, CONNECTIONS_COUNT, SESSIONS_COUNT, SUBSCRIPTIONS_COUNT, RETAINED_MESSAGES |
| 27 | emqx_deep.sh | (新增) | CLIENT_DETAIL, MESSAGE_STATS, DROPPED_STATS, TOPIC_COUNT, CLUSTER_STATUS, ALARM_LIST |
| 28 | java_check.sh | Test-JavaApplication | JAVA_VERSION, JVM_UPTIME, THREAD_COUNT, BLOCKED_THREAD_COUNT, WAITING_THREAD_COUNT, GC_FULL_COUNT, HEAP_MEMORY_USAGE |
| 29 | python_check.sh | (新增) | PYTHON_CONTAINER_STATUS, PYTHON_ERROR_COUNT, PYTHON_ERROR_TYPES |
| 30 | nginx_check.sh | (新增) | NGINX_VERSION, NGINX_CONFIG_STATUS, NGINX_WORKER_COUNT, NGINX_ERROR_COUNT, NGINX_SLOW_REQUESTS |
| 31 | nacos_check.sh | (新增) | NACOS_VERSION, NACOS_CONTAINER_STATUS, NACOS_ERROR_COUNT |
| 32 | fastdfs_check.sh | Test-FastDFSStatus | TRACKER_STATUS, STORAGE_STATUS, STORAGE_SPACE, SYNC_STATUS |
| 33 | app_logs.sh | Test-ApplicationLogs | JAVA_ERROR_COUNT, PYTHON_ERROR_COUNT, NGINX_ERROR_COUNT, NACOS_ERROR_COUNT, ERROR_FREQ_STATS |
---
## 四、实施计划
### 4.1 第一阶段:基础设施搭建(第1-2周)
**任务清单**:
- [ ] 创建目录结构
- [ ] 实现 `lib/config.sh` 配置文件
- [ ] 实现 `lib/common.sh` 通用函数库
- [ ] 修改主脚本,实现模块加载框架
- [ ] 实现模块上传和执行机制
- [ ] 实现结果解析框架
**验收标准**:
- 目录结构完整
- 配置文件可正常加载
- 通用函数可正常调用
- 主脚本可成功上传并执行一个测试模块
### 4.2 第二阶段:系统模块拆分(第3-4周)
**任务清单**:
- [ ] 拆分 `01_system_basic.sh` - 系统基础信息
- [ ] 拆分 `02_cpu_check.sh` - CPU资源检测
- [ ] 拆分 `03_memory_check.sh` - 内存资源检测
- [ ] 拆分 `04_disk_check.sh` - 磁盘资源检测
- [ ] 拆分 `05_oom_check.sh` - OOM和内核异常检测
- [ ] 拆分 `06_process_check.sh` - 进程状态检测
- [ ] 拆分 `07_network_check.sh` - 网络连接检测
- [ ] 拆分 `08_security_check.sh` - 安全合规检测
- [ ] 拆分 `09_system_logs.sh` - 系统日志检测
- [ ] 拆分 `10_time_sync.sh` - 时间同步检测
- [ ] 拆分 `11_scheduled_tasks.sh` - 定时任务检测
- [ ] 拆分 `12_port_check.sh` - 端口服务检测
**验收标准**:
- 所有系统模块可独立执行
- 所有系统模块输出格式符合规范
- 主脚本可成功调用所有系统模块
- 检测结果与原脚本一致
### 4.3 第三阶段:服务模块拆分(第5-7周)
**任务清单**:
- [ ] 拆分 `20_docker_basic.sh` - Docker基础检测
- [ ] 拆分 `21_docker_deep.sh` - Docker深度检测
- [ ] 拆分 `22_mysql_basic.sh` - MySQL基础检测
- [ ] 适配 `23_mysql_depth.sh` - MySQL深度检测(已存在)
- [ ] 拆分 `24_redis_basic.sh` - Redis基础检测
- [ ] 适配 `25_redis_depth.sh` - Redis深度检测(已存在)
- [ ] 拆分 `26_emqx_basic.sh` - EMQX基础检测
- [ ] 拆分 `27_emqx_deep.sh` - EMQX深度检测
- [ ] 拆分 `28_java_check.sh` - Java应用检测
- [ ] 拆分 `29_python_check.sh` - Python应用检测
- [ ] 拆分 `30_nginx_check.sh` - Nginx应用检测
- [ ] 拆分 `31_nacos_check.sh` - Nacos应用检测
- [ ] 拆分 `32_fastdfs_check.sh` - FastDFS检测
- [ ] 拆分 `33_app_logs.sh` - 应用日志分析
**验收标准**:
- 所有服务模块可独立执行
- 所有服务模块输出格式符合规范
- 主脚本可成功调用所有服务模块
- 检测结果与原脚本一致
- MySQL/Redis深度检测脚本适配完成
### 4.4 第四阶段:测试和优化(第8周)
**任务清单**:
- [ ] 完整功能测试(端到端)
- [ ] 性能测试(执行时间对比)
- [ ] 错误处理测试(各种异常场景)
- [ ] 编写使用文档
- [ ] 代码审查和优化
**验收标准**:
- 所有功能测试通过
- 执行时间不比原脚本差
- 异常场景有合适的错误提示
- 文档完整清晰
---
## 五、技术规范
### 5.1 Shell脚本编码规范
#### 5.1.1 命名规范
```bash
# 函数命名:小写字母+下划线
get_cpu_usage() { }
check_container_running() { }
# 变量命名:大写字母+下划线
CPU_CORES=8
MEMORY_TOTAL=1024
# 常量命名:全大写+下划线
MYSQL_PASSWORD="dNrprU&2S"
MAX_CONNECTIONS=500
```
#### 5.1.2 错误处理规范
```bash
# 设置错误时退出
set -e
# 错误处理函数
handle_error() {
local line_no=$1
echo "ERROR:在第 $line_no 行发生错误"
exit 1
}
trap 'handle_error $LINENO' ERR
```
#### 5.1.3 代码结构规范
```bash
#!/bin/bash
################################################################################
# 模块说明
################################################################################
# 加载依赖
source "$LIB_DIR/config.sh"
source "$LIB_DIR/common.sh"
# 主函数
main() {
# 1. 检查前置条件
# 2. 执行检测
# 3. 输出结果
}
# 执行主函数
main "$@"
```
### 5.2 PowerShell脚本编码规范
#### 5.2.1 函数命名规范
```powershell
# 函数命名:PascalCase
function Invoke-ModuleCheck { }
function Parse-ModuleResult { }
function Convert-ToResultObject { }
# 变量命名:PascalCase(全局)或camelCase(局部)
$Script:检测结果 = @()
$moduleName = "cpu_check"
```
#### 5.2.2 注释规范
```powershell
################################################################################
# 函数说明
################################################################################
<#
.SYNOPSIS
执行检测模块
.DESCRIPTION
上传模块到远程服务器并执行,返回结果
.PARAMETER ModuleName
模块名称(如: cpu_check.sh)
.PARAMETER Category
模块分类(system/service)
.EXAMPLE
Invoke-ModuleCheck -ModuleName "02_cpu_check.sh" -Category "system"
#>
function Invoke-ModuleCheck {
param(...)
}
```
---
## 六、兼容性考虑
### 6.1 向后兼容性
**保持功能一致**:
- 模块化后检测功能与原脚本完全一致
- 报告格式保持不变
- 用户交互方式保持不变
- 输出字段名称保持一致
**平滑迁移**:
- 保留原脚本作为 `check_server_health_legacy.ps1`
- 新脚本为 `check_server_health.ps1`
- 用户可自由选择使用哪个版本
### 6.2 扩展性设计
**新增模块步骤**:
1. 在 `lib/system/` 或 `lib/service/` 目录下创建新的Shell脚本
2. 按照规范实现检测逻辑
3. 在主脚本的模块配置中注册新模块
4. 无需修改主脚本其他部分
**模块独立性**:
- 新模块不影响现有模块
- 模块之间不相互依赖
- 可选择性执行某些模块
---
## 七、测试策略
### 7.1 单元测试
**Shell模块测试**:
```bash
# 直接执行模块测试
./02_cpu_check.sh
# 验证输出格式
./02_cpu_check.sh | grep -E "^[A-Z_]+:"
```
**PowerShell函数测试**:
```powershell
# 测试结果解析
$result = "CPU_USAGE:45.6`nCPU_CORES:8"
Parse-ModuleResult $result "cpu_check"
```
### 7.2 集成测试
**端到端测试流程**:
1. 运行新脚本,输入服务器信息
2. 等待所有模块执行完成
3. 对比新旧脚本的报告内容
4. 验证检测结果一致性
### 7.3 性能测试
**性能指标**:
- 模块上传时间: <10秒
- 单个模块执行时间: <30秒
- 总执行时间: 不超过原脚本120%
---
## 八、风险管理
### 8.1 技术风险
| 风险 | 影响 | 应对措施 |
|:---|:---|:---|
| SSH连接不稳定 | 模块上传失败 | 增加重试机制,支持断点续传 |
| 远程服务器权限不足 | 模块无法执行 | 前置权限检查,清晰的错误提示 |
| Shell脚本兼容性 | 不同Linux发行版差异 | 使用POSIX标准,避免发行版特定命令 |
| 解析错误 | 输出格式变化导致解析失败 | 严格的格式验证,异常容错处理 |
### 8.2 进度风险
| 风险 | 影响 | 应对措施 |
|:---|:---|:---|
| 工作量评估不足 | 无法按期完成 | 分阶段实施,优先高价值模块 |
| 人员变动 | 知识流失 | 详细的文档和代码注释 |
| 需求变更 | 返工 | 模块化设计便于应对变更 |
---
## 九、成功标准
### 9.1 功能成功标准
- [ ] 所有检测功能成功拆分为独立模块
- [ ] 新脚本检测功能与原脚本完全一致
- [ ] 报告格式和内容与原脚本保持一致
- [ ] 模块可独立执行和测试
### 9.2 质量成功标准
- [ ] 代码符合规范要求
- [ ] 所有模块有适当的错误处理
- [ ] 代码有清晰的注释和文档
- [ ] 通过完整的端到端测试
### 9.3 维护性成功标准
- [ ] 新增功能可在30分钟内完成
- [ ] 模块修改不影响其他模块
- [ ] 问题定位时间减少50%以上
---
## 十、附录
### 附录A:模块开发模板
```bash
#!/bin/bash
################################################################################
# 模块名称: [功能名称]检测
# 功能描述: [详细描述检测内容]
# 输出格式: KEY:VALUE
# 作者: [作者名称]
# 创建日期: [日期]
# 依赖: config.sh, common.sh
################################################################################
# 获取脚本目录
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LIB_DIR="$(dirname "$SCRIPT_DIR")/lib"
# 加载依赖
source "$LIB_DIR/config.sh"
source "$LIB_DIR/common.sh"
################################################################################
# 主函数
################################################################################
main() {
local error_count=0
# 1. 检测项1
local value1=$(get_value1)
if [ -z "$value1" ]; then
echo "ERROR:无法获取value1"
((error_count++))
else
echo "KEY1:$value1"
fi
# 2. 检测项2
local value2=$(get_value2)
if [ -z "$value2" ]; then
echo "ERROR:无法获取value2"
((error_count++))
else
echo "KEY2:$value2"
fi
# 返回错误码
return $error_count
}
# 执行主函数
main "$@"
```
### 附录B:输出键名规范
**键名命名规则**:
- 使用大写字母和下划线
- 使用有意义的英文缩写或全拼
- 同类数据使用统一前缀
**常用前缀**:
- `CPU_*` - CPU相关指标
- `MEMORY_*` - 内存相关指标
- `DISK_*` - 磁盘相关指标
- `NET_*` - 网络相关指标
- `CONN_*` - 连接相关指标
- `PROCESS_*` - 进程相关指标
- `ERROR_*` - 错误信息
### 附录C:进度跟踪表
| 阶段 | 任务 | 状态 | 完成日期 | 负责人 |
|:---|:---|:---|:---|:---|
| 第一阶段 | 目录结构创建 | 待开始 | - | - |
| 第一阶段 | config.sh实现 | 待开始 | - | - |
| 第一阶段 | common.sh实现 | 待开始 | - | - |
| 第一阶段 | 主脚本框架改造 | 待开始 | - | - |
| 第二阶段 | 系统模块拆分 | 待开始 | - | - |
| 第三阶段 | 服务模块拆分 | 待开始 | - | - |
| 第四阶段 | 测试和优化 | 待开始 | - | - |
---
**文档版本历史**:
- v1.0 (2026-05-09) - 初始版本,定义模块化拆分方案
**审批记录**:
- [ ] 待评审
- [ ] 待批准
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论