提交 658711d3 authored 作者: 陈泽健's avatar 陈泽健

feat(server): 增强服务器健康监测模块功能

- 更新模块分类规则,调整Docker和MySQL模块编号范围
- 新增综合诊断模块支持,添加40_综合诊断模块前缀识别
- 扩展显示名称映射,增加Redis、MySQL、EMQX等组件的基础指标
- 完善阈值配置体系,添加系统、Docker、MySQL、Redis、EMQX等多组件阈值
- 增加新的状态判断逻辑,支持运行中、异常、未运行、已停止等状态识别
- 引入深度检测模块,新增21_Docker深度检测等功能模块
- 重构报告生成逻辑,优化Markdown报告格式和内容结构
- 增加综合诊断数据文件生成功能,支持远程数据传输处理
上级 738196ed
......@@ -343,8 +343,8 @@ function Get-ModuleCategory {
switch -Regex ($ModuleName) {
"^(0[1-9]|1[0-2])_" { return "系统基础信息" }
"^20_" { return "Docker容器" }
"^2[123]_" { return "MySQL数据库" }
"^(20|21)_" { return "Docker容器" }
"^2[23]_" { return "MySQL数据库" }
"^2[45]_" { return "Redis缓存" }
"^2[67]_" { return "EMQX消息队列" }
"^28_" { return "Java应用" }
......@@ -353,6 +353,7 @@ function Get-ModuleCategory {
"^31_" { return "Nacos应用" }
"^32_" { return "FastDFS应用" }
"^33_" { return "应用日志" }
"^40_" { return "综合诊断" }
default { return "其他" }
}
}
......@@ -400,6 +401,7 @@ function Get-DisplayName {
)
$displayNames = @{
# 系统基础信息
"HOSTNAME" = "主机名"
"OS_VERSION" = "操作系统版本"
"KERNEL_VERSION" = "内核版本"
......@@ -410,6 +412,74 @@ function Get-DisplayName {
"CPU_USAGE" = "CPU使用率"
"MEMORY_USAGE" = "内存使用率"
"DISK_USAGE" = "磁盘使用率"
# Redis基础
"KEY_COUNT" = "Redis键数量"
"CLIENT_COUNT" = "Redis客户端数"
"CACHE_HIT_RATE" = "Redis缓存命中率"
"REDIS_VERSION" = "Redis版本"
"REDIS_MEMORY_USED" = "Redis内存使用"
"REDIS_MEM_FRAGMENTATION" = "Redis内存碎片率"
"REJECTED_CONNECTIONS" = "Redis拒绝连接数"
"SLOW_LOG_TOP10" = "Redis慢日志TOP10"
"PERSISTENCE_STATUS" = "Redis持久化状态"
"REPLICATION_STATUS" = "Redis复制状态"
"CLIENT_DETAIL" = "Redis客户端详情"
"CLUSTER_STATUS" = "Redis集群状态"
# MySQL基础
"CONNECTIONS" = "MySQL连接数信息"
"SLOW_QUERIES" = "MySQL慢查询数"
"MYSQL_VERSION" = "MySQL版本"
"MYSQL_CONNECTIONS_CURRENT" = "MySQL当前连接数"
"MYSQL_CONNECTIONS_USAGE" = "MySQL连接使用率"
"MYSQL_INNODB_HIT_RATE" = "MySQL InnoDB命中率"
"QPS" = "MySQL QPS"
"TPS" = "MySQL TPS"
"DEADLOCKS" = "MySQL死锁数"
"BUFFER_POOL_HIT_RATE" = "MySQL缓冲池命中率"
"ACTIVE_QUERIES" = "MySQL活跃查询数"
"FRAGMENTED_TABLES" = "MySQL碎片表数量"
"CONN_ERRORS" = "MySQL连接错误数"
"TRX_ACTIVE" = "MySQL活跃事务数"
"LOCK_WAITS" = "MySQL锁等待数"
"THREADS_POOL" = "MySQL线程池状态"
"TEMP_TABLE_RATE" = "MySQL临时表使用率"
"DATABASE_SIZE" = "MySQL数据库大小"
"TABLE_COUNT" = "MySQL表数量"
"TABLES_WITHOUT_INDEX" = "MySQL无索引表数量"
"SLOW_QUERY_TOP1" = "MySQL最慢查询统计"
"LOCK_DETAIL" = "MySQL锁详情"
"FRAGMENTED_DETAIL" = "MySQL碎片详情"
# EMQX基础
"EMQX_CLIENTS_TOTAL" = "EMQX客户端总数"
"EMQX_CLIENTS_CONNECTED" = "EMQX已连接客户端"
"EMQX_CLIENTS_DISCONNECTED" = "EMQX已断开客户端"
"EMQX_SUBSCRIPTIONS_TOTAL" = "EMQX订阅总数"
"EMQX_TOPICS_TOTAL" = "EMQX主题总数"
"EMQX_ROUTES_TOTAL" = "EMQX路由总数"
"EMQX_LISTENERS_COUNT" = "EMQX监听器数量"
"EMQX_SESSIONS_TOTAL" = "EMQX会话总数"
"EMQX_SESSIONS_ACTIVE" = "EMQX活跃会话"
"EMQX_SESSIONS_INACTIVE" = "EMQX非活跃会话"
"EMQX_NODE_STATUS" = "EMQX节点状态"
"EMQX_NODE_LEVEL" = "EMQX节点状态等级"
"EMQX_CLUSTER_MODE" = "EMQX集群模式"
"EMQX_CLUSTER_NODES" = "EMQX集群节点数"
"EMQX_MAX_CONNECTIONS" = "EMQX最大连接数"
"EMQX_CONN_USAGE" = "EMQX连接使用率"
"EMQX_CONN_LEVEL" = "EMQX连接状态等级"
"EMQX_MEMORY_USAGE" = "EMQX内存使用率"
"EMQX_PLUGINS_COUNT" = "EMQX插件数量"
"EMQX_RULES_COUNT" = "EMQX规则数量"
"EMQX_ALARMS_COUNT" = "EMQX告警数量"
"EMQX_ALARMS_LEVEL" = "EMQX告警等级"
"EMQX_MESSAGES_SENT" = "EMQX发送消息数"
"EMQX_MESSAGES_RECEIVED" = "EMQX接收消息数"
"EMQX_VERSION" = "EMQX版本"
# 通用状态
"DOCKER_STATUS" = "Docker状态"
"MYSQL_STATUS" = "MySQL状态"
"REDIS_STATUS" = "Redis状态"
......@@ -434,9 +504,224 @@ function Get-Threshold {
)
$thresholds = @{
# ==================== 系统基础信息阈值 ====================
"CPU使用率" = ">85%"
"CPU_USAGE" = ">85%"
"内存使用率" = ">85%"
"MEMORY_USAGE" = ">85%"
"DISK_USAGE" = ">90%"
"SWAP_USAGE" = ">20%"
"SWAP使用率" = ">20%"
"SWAP_USED" = ">20%"
"1分钟负载" = ">8"
"LOAD_1MIN" = ">8"
"5分钟负载" = ">8"
"LOAD_5MIN" = ">8"
"15分钟负载" = ">8"
"LOAD_15MIN" = ">8"
"THREAD_COUNT" = ">1000"
"线程总数" = ">1000"
"FD_USAGE" = ">80%"
"文件描述符使用率" = ">80%"
"ZOMBIE_COUNT" = ">0"
"僵尸进程数" = ">0"
"TCP_TIME_WAIT" = ">500"
"TIME_WAIT连接数" = ">500"
"TCP_CLOSE_WAIT" = ">100"
"CLOSE_WAIT连接数" = ">100"
# ==================== Docker容器阈值 ====================
"DOCKER_LARGE_LOGS" = ">500MB"
"DOCKER_IMAGES_DANGLING" = ">5"
"LOG_CARDTABLE_SIZE" = ">500MB"
"LOG_PAPERLESS_SIZE" = ">500MB"
"LOG_UPYTHON_VOICE_SIZE" = ">500MB"
"LOG_UPYTHON_SIZE" = ">500MB"
"LOG_UJAVA2_SIZE" = ">500MB"
"LOG_UNGINX_SIZE" = ">500MB"
"LOG_UNGROK_SIZE" = ">500MB"
"LOG_USTORAGE_SIZE" = ">500MB"
"LOG_UTRACKER_SIZE" = ">500MB"
"LOG_UNACOS_SIZE" = ">500MB"
"LOG_UEMQX_SIZE" = ">500MB"
"LOG_UREDIS_SIZE" = ">500MB"
"LOG_UMYSQL_SIZE" = ">500MB"
# ==================== MySQL阈值 ====================
"MYSQL_CONNECTIONS_USAGE" = ">80%"
"MYSQL_CONNECTIONS_LEVEL" = ">80%"
"MySQL连接使用率" = ">80%"
"MYSQL_SLOW_QUERIES" = ">100"
"MYSQL慢查询数" = ">100"
"SLOW_QUERIES" = ">100"
"MYSQL_TABLE_USAGE" = ">90%"
"MYSQL表缓存使用率" = ">90%"
"MYSQL_INNODB_HIT_RATE" = "<95%"
"MYSQL_INNODB命中率" = "<95%"
"MYSQL_CONNECTIONS_CURRENT" = ">400"
"MySQL当前连接数" = ">400"
# ==================== Redis阈值 ====================
"REDIS_KEYS" = ">1000000"
"REDIS_KEYS_LEVEL" = ">1000000"
"REDIS键数量" = ">1000000"
"KEY_COUNT" = ">1000000"
"REDIS_MEM_FRAGMENTATION" = ">5"
"REDIS_MEM_FRAG_LEVEL" = ">5"
"REDIS内存碎片率" = ">5"
"REDIS_CLIENTS" = ">500"
"REDIS客户端数" = ">500"
"CLIENT_COUNT" = ">500"
"CACHE_HIT_RATE" = "<90%"
"Redis缓存命中率" = "<90%"
# ==================== EMQX阈值 ====================
"EMQX_CLIENTS" = ">1000"
"EMQX客户端数" = ">1000"
"EMQX_CONNECTIONS" = ">1000"
"EMQX_SUBSCRIPTIONS" = ">5000"
"EMQX订阅数" = ">5000"
"EMQX_SESSIONS" = ">1000"
"EMQX_TOPICS" = ">100"
"EMQX_ROUTES" = ">1000"
"EMQX_LISTENERS" = "<1"
"EMQX_CLUSTER_NODES" = "<1"
"EMQX_SESSIONS_ACTIVE" = "<1"
"EMQX_CLIENTS_TOTAL" = ">1000"
"EMQX_CLIENTS_CONNECTED" = ">1000"
"EMQX_SUBSCRIPTIONS_TOTAL" = ">5000"
"EMQX_TOPICS_TOTAL" = ">100"
"EMQX_ROUTES_TOTAL" = ">1000"
"EMQX_LISTENERS_COUNT" = "<1"
"EMQX_SESSIONS_TOTAL" = ">1000"
# ==================== Java应用阈值 ====================
"JAVA_THREADS" = ">500"
"JAVA_THREAD_COUNT" = ">500"
"Java线程数" = ">500"
"JAVA_LOG_ERRORS" = ">10"
# ==================== 综合诊断阈值 ====================
"DIAG_SWAP_LEVEL" = ">20%"
"DIAG_ZOMBIE_LEVEL" = ">0"
"DIAG_TIMEWAIT_LEVEL" = ">500"
"DIAG_LOAD_1MIN" = ">8"
"DIAG_MEMORY_USAGE" = ">85%"
"DIAG_ZOMBIE" = ">0"
"DIAG_TIMEWAIT" = ">500"
"DIAG_MYSQL_SLOW" = ">100"
"DIAG_MYSQL_SLOW_LEVEL" = ">100"
# ==================== 系统日志阈值 ====================
"APP_LOG_ERRORS_24H" = ">50"
"APP_LOG_LEVEL" = ">50"
"AUTH_FAILURES_24H" = ">100"
"KERNEL_ERRORS_24H" = ">10"
"DISK_ERRORS_24H" = ">5"
"OOM_LOGS_7D" = ">1"
"OOM_COUNT" = ">1"
"LARGE_LOG_FILES" = ">0"
"CRON_ERRORS_24H" = ">20"
# ==================== Python应用阈值 ====================
"PYTHON_PROCESSES" = ">100"
"PYTHON_MEMORY" = ">80%"
# ==================== Nacos应用阈值 ====================
"NACOS_MEMORY" = ">80%"
# ==================== Redis额外阈值 ====================
"REDIS_MEMORY_USED" = ">1GB"
"REDIS_BLOCKED_CLIENTS" = ">10"
# ==================== Docker额外阈值 ====================
"DOCKER_IMAGES_COUNT" = ">20"
"DOCKER_VOLUMES_COUNT" = ">10"
"DOCKER_NETWORK_COUNT" = ">5"
# ==================== 系统基础信息额外阈值 ====================
"PROCESS_COUNT" = ">500"
"TCP_ESTABLISHED" = ">500"
"KERNEL_ERRORS" = ">5"
"CORE_DUMP_COUNT" = ">1"
"AUTH_FAIL_COUNT" = ">50"
"NET_ERRORS" = ">0"
"LOAD_RATIO" = ">1"
"BOOT_FAILED_SERVICES" = ">0"
"LOGGED_USERS" = ">5"
"PASS_MAX_DAYS" = ">90"
# ==================== 端口连接数阈值 ====================
"PORT_MySQL_CONNECTIONS" = ">100"
"PORT_Redis_CONNECTIONS" = ">200"
"PORT_EMQX_MQTT_CONNECTIONS" = ">100"
"PORT_HTTP_CONNECTIONS" = ">50"
"PORT_HTTPS_CONNECTIONS" = ">50"
# ==================== MySQL额外阈值 ====================
"CONN_ERRORS" = ">100"
"DATABASE_SIZE" = ">10GB"
"QPS" = ">1000"
"TPS" = ">100"
"DEADLOCKS" = ">0"
# ==================== EMQX额外阈值 ====================
"EMQX_CLIENTS_DISCONNECTED" = ">50"
"EMQX_MAX_CONNECTIONS" = "<1000"
"EMQX_CONN_USAGE" = ">80%"
"EMQX_CONN_LEVEL" = ">80%"
"EMQX_MEMORY_USAGE" = ">80%"
"EMQX_SESSIONS_INACTIVE" = ">100"
"EMQX_PLUGINS_COUNT" = "<1"
"EMQX_RULES_COUNT" = "未启用"
"EMQX_ALARMS_COUNT" = ">0"
"EMQX_ALARMS_LEVEL" = ">0"
"EMQX_MESSAGES_SENT" = ">1000000"
"EMQX_MESSAGES_RECEIVED" = ">1000000"
"EMQX_CLUSTER_MODE" = "否"
"EMQX_LISTENER_PORTS" = "<1"
# ==================== Redis额外阈值(扩展) ====================
"REJECTED_CONNECTIONS" = ">50"
"SLOW_LOG_TOP10_Count" = ">5"
"SLOW_LOG_TOP10_Slowest" = ">100000"
"CLIENT_DETAIL_IdleOver5min" = ">10"
"CLIENT_DETAIL_Blocking" = ">0"
"PERSISTENCE_STATUS" = "备份中"
"CLUSTER_STATUS" = "!OK"
"KEY_TYPE_DISTRIBUTION" = "N/A"
# ==================== MySQL额外阈值(扩展) ====================
"ACTIVE_QUERIES" = ">50"
"FRAGMENTED_TABLES" = ">5"
"TRX_ACTIVE" = ">10"
"LOCK_WAITS" = ">0"
"THREADS_POOL" = ">50"
"TEMP_TABLE_RATE" = ">30%"
"ACTIVE_PROCESSLIST_LongRunning" = ">5"
"TABLES_WITHOUT_INDEX" = ">0"
"SLOW_QUERY_TOP1_TotalTime" = ">10"
"LOCK_DETAIL_Waits" = ">0"
"FRAGMENTED_DETAIL_Count" = ">5"
"BUFFER_POOL_HIT_RATE" = "<95%"
# ==================== 日志系统阈值 ====================
"JOURNAL_DISK_USAGE" = ">500M"
# ==================== 端口检测阈值 ====================
"OPEN_PORTS" = ">100"
"OPEN_PORTS_LEVEL" = ">100"
# ==================== 安全检测阈值 ====================
"SUSPICIOUS_SUID_COUNT" = ">5"
"FAILED_SERVICES" = ">0"
# ==================== 时间同步阈值 ====================
"CLOCK_OFFSET" = ">1秒"
"时钟偏差" = ">1秒"
# ==================== 证书检测阈值 ====================
"SSL_CERT_DAYS_LEFT" = "<30天"
"EMQX_CERT_DAYS_LEFT" = "<30天"
}
if ($thresholds.ContainsKey($Key)) {
......@@ -453,13 +738,17 @@ function Get-StatusByValue {
[string]$Value
)
if ($Value -match "^(正常|警告|严重|ERROR|OK)$") {
if ($Value -match "^(正常|警告|严重|ERROR|OK|运行中|异常|未运行|已停止)$") {
switch ($Value) {
"正常" { return "正常" }
"警告" { return "警告" }
"严重" { return "严重" }
"OK" { return "正常" }
"ERROR" { return "严重" }
"运行中" { return "正常" }
"异常" { return "严重" }
"未运行" { return "严重" }
"已停止" { return "警告" }
default { return "正常" }
}
}
......@@ -542,15 +831,21 @@ function Invoke-AllChecks {
"10_time_sync.sh", "11_scheduled_tasks.sh", "12_port_check.sh"
)
# 综合诊断模块(在所有模块之后执行)
$comprehensiveModules = @(
"40_comprehensive_diagnosis.sh"
)
# 服务模块列表
$serviceModules = @(
"20_docker_basic.sh", "22_mysql_basic.sh", "24_redis_basic.sh",
"26_emqx_basic.sh", "28_java_check.sh", "29_python_check.sh",
"30_nginx_check.sh", "31_nacos_check.sh", "32_fastdfs_check.sh",
"33_app_logs.sh"
"20_docker_basic.sh", "21_docker_deep.sh", "22_mysql_basic.sh",
"23_mysql_depth.sh", "24_redis_basic.sh", "25_redis_depth.sh",
"26_emqx_basic.sh", "27_emqx_deep.sh", "28_java_check.sh",
"29_python_check.sh", "30_nginx_check.sh", "31_nacos_check.sh",
"32_fastdfs_check.sh", "33_app_logs.sh"
)
$totalModules = $systemModules.Count + $serviceModules.Count
$totalModules = $systemModules.Count + $serviceModules.Count + $comprehensiveModules.Count
$currentModule = 0
# 执行系统模块
......@@ -587,6 +882,47 @@ function Invoke-AllChecks {
}
}
Write-Log ""
Write-Log "--- 生成综合诊断数据文件 ---" "INFO"
# 生成综合诊断所需的当前数据文件
$dataFilePath = "$modulePath/current_data.txt"
$dataContent = ""
foreach ($category in $script:TestResults.Keys) {
$items = $script:TestResults[$category]
foreach ($item in $items) {
$dataContent += "$($item.Name):$($item.Value)`n"
}
}
# 通过SSH保存数据文件到远程服务器
$tempFile = [System.IO.Path]::GetTempFileName()
$dataContent | Out-File -FilePath $tempFile -Encoding UTF8 -Force
$pscpPath = Join-Path $scriptPath "pscp.exe"
& $pscpPath -P $script:Port -l $script:Username -pw $script:Password $tempFile "${script:Username}@${script:HostName}:$modulePath/current_data.txt" 2>&1 | Out-Null
Remove-Item $tempFile -Force
Write-Log "数据文件已生成: $dataFilePath"
# 执行综合诊断模块
Write-Log ""
Write-Log "--- 综合诊断检测 ---" "INFO"
foreach ($module in $comprehensiveModules) {
$currentModule++
Write-Host ""
Write-Host "[$currentModule/$totalModules] " -NoNewline
Write-Host "执行: $module" -ForegroundColor Cyan
$results = Invoke-ModuleCheck -ModuleName $module -Category "system"
$category = Get-ModuleCategory -ModuleName $module
foreach ($result in $results) {
Save-TestResult -Category $category -Result $result
}
}
Write-Log ""
Write-Log "========================================" "INFO"
Write-Log "所有检测模块执行完成!" "INFO"
......@@ -597,73 +933,243 @@ function Invoke-AllChecks {
function New-MarkdownReport {
$reportLines = @()
$reportLines += "# 服务器健康检测报告"
$reportLines += ""
$reportLines += "**生成时间**: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')"
$reportLines += "**目标主机**: $script:HostName"
$reportLines += ""
# 获取系统基础信息
$basicInfo = @{}
if ($script:TestResults.ContainsKey("系统基础信息")) {
foreach ($item in $script:TestResults["系统基础信息"]) {
$basicInfo[$item.Key] = $item.Value
}
}
$reportLines += "## 执行摘要"
# 报告头部
$reportLines += "# 服务器健康巡检报告"
$reportLines += ""
$reportLines += "**时间:** $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')"
$hostLine = "**主机:** $script:HostName"
if ($basicInfo["HOSTNAME"]) {
$hostLine += " ($($basicInfo["HOSTNAME"]))"
}
$reportLines += $hostLine
if ($basicInfo["OS_VERSION"]) {
$reportLines += "**操作系统:** $($basicInfo["OS_VERSION"])"
}
if ($basicInfo["KERNEL_VERSION"]) {
$reportLines += "**内核:** $($basicInfo["KERNEL_VERSION"])"
}
if ($basicInfo["BOOT_TIME"]) {
$reportLines += "**启动时间:** $($basicInfo["BOOT_TIME"])"
}
if ($basicInfo["UPTIME_DAYS"]) {
$reportLines += "**运行时间:** $($basicInfo["UPTIME_DAYS"]) days"
}
# 总体状态
if ($script:CriticalIssues.Count -gt 0) {
$reportLines += "**总体状态**: 🔴 严重"
$reportLines += "**状态:** 🔴 严重"
}
elseif ($script:WarningIssues.Count -gt 0) {
$reportLines += "**总体状态**: 🟡 警告"
$reportLines += "**状态:** 🟡 警告"
}
else {
$reportLines += "**总体状态**: 🟢 正常"
$reportLines += "**状态:** 🟢 正常"
}
$reportLines += ""
$reportLines += "- 严重问题: $($script:CriticalIssues.Count)"
$reportLines += "- 警告问题: $($script:WarningIssues.Count)"
$reportLines += "---"
$reportLines += ""
# 核心问题诊断
$reportLines += "## 核心问题诊断"
if ($script:CriticalIssues.Count -gt 0) {
$reportLines += "### 🔴 严重问题"
$reportLines += ""
$reportLines += "### 严重问题 ($($script:CriticalIssues.Count)个)"
foreach ($issue in $script:CriticalIssues) {
$reportLines += "- $issue"
$reportLines += "+ 🔴 $issue"
}
$reportLines += ""
$reportLines += ""
}
if ($script:WarningIssues.Count -gt 0) {
$reportLines += "### 🟡 警告问题"
$reportLines += ""
$reportLines += "### 警告问题 ($($script:WarningIssues.Count)个)"
foreach ($issue in $script:WarningIssues) {
$reportLines += "- $issue"
$reportLines += "+ 🟡 $issue"
}
$reportLines += ""
$reportLines += ""
}
$reportLines += "**诊断摘要:** 关键问题: $($script:CriticalIssues.Count), 警告: $($script:WarningIssues.Count)"
$reportLines += ""
$reportLines += "---"
$reportLines += ""
# 系统基础信息表格
if ($script:TestResults.ContainsKey("系统基础信息")) {
$reportLines += "## 系统基础信息"
$reportLines += "| 项目 | 值 |"
$reportLines += "| :--- | :--- |"
$basicItems = @("HOSTNAME", "OS_VERSION", "KERNEL_VERSION", "BOOT_TIME", "UPTIME_DAYS", "CPU_CORES", "MEMORY_TOTAL")
$basicDisplayNames = @{
"HOSTNAME" = "主机名"
"OS_VERSION" = "操作系统"
"KERNEL_VERSION" = "内核版本"
"BOOT_TIME" = "启动时间"
"UPTIME_DAYS" = "运行时间"
"CPU_CORES" = "CPU核心数"
"MEMORY_TOTAL" = "总内存"
}
foreach ($key in $basicItems) {
if ($basicInfo.ContainsKey($key)) {
$name = $basicDisplayNames[$key]
$value = $basicInfo[$key]
$reportLines += "| $name | $value |"
}
}
# 添加系统负载
if ($basicInfo.ContainsKey("LOAD_1MIN")) {
$reportLines += "| 系统负载 | $($basicInfo["LOAD_1MIN"]) $($basicInfo["LOAD_5MIN"]) $($basicInfo["LOAD_15MIN"]) |"
}
$reportLines += ""
$reportLines += ""
}
# 模块检测结果
$moduleNumber = 0
foreach ($category in $script:TestResults.Keys) {
if ($category -eq "系统基础信息") { continue }
$items = $script:TestResults[$category]
if ($items.Count -eq 0) { continue }
$reportLines += "## $category"
$moduleNumber++
$reportLines += "---"
$reportLines += ""
$reportLines += "## 检测模块 $moduleNumber$category"
$reportLines += ""
$reportLines += "| 检测项 | 数值 | 阈值 | 状态 |"
$reportLines += "|:---|:---|:---|:---|"
# 过滤显示项
$displayedItems = @()
$seenItems = @{} # 用于去重
foreach ($item in $items) {
$status = Get-StatusIcon -Status $item.Status
$reportLines += "| $($item.Name) | $($item.Value) | $($item.Threshold) | $status |"
# 跳过错误信息和特殊项
if ($item.Name -match "^awk$" -or
$item.Name -match "cmd\.line" -or
$item.Name -match "unexpected" -or
$item.Value -match "^awk\|" -or
$item.Value -match "unexpected newline") {
continue
}
# 跳过容器详细信息和内部变量
if ($item.Name -match "^(CONTAINER_|LIMIT_|PROCESSES_)" -and
$item.Name -notmatch "_(STATUS|LEVEL)$") {
continue
}
# 跳过Docker详细项(保留状态)
if ($item.Name -match "^DOCKER_" -and
$item.Name -notmatch "_(STATUS|LEVEL)$") {
continue
}
# 跳过所有日志项(除了容器状态)
if ($item.Name -match "^LOG_") {
continue
}
# 跳过端口连接详细信息(保留状态)
if ($item.Name -match "^PORT_.*_CONNECTIONS$" -and $item.Name -notmatch "_STATUS$") {
continue
}
# 跳过长列表内容项
if ($item.Name -match "(TOP5|TOP10|TOP20|_LIST|_DETAIL|_DISTRIBUTION|_STATS|_CONFIG|_INFO|TOPICS|TOP1)$") {
continue
}
# 跳过纯配置类长列表项(但保留重要的状态和统计)
if ($item.Name -match "(LONG_QUERIES|DATABASE_LIST|UBAINS_TABLES|INNODB_BP|_LIST\$|TOP20|TOP10|TOP5)") {
continue
}
# 跳过冗余的详细信息(但有价值的指标保留)
if ($item.Name -match "^(MEMORY_INFO|CONFIG_CHECK|KEYSPACE_DETAIL|KEY_TYPE_DISTRIBUTION|COMMAND_STATS|ACTIVE_PROCESSLIST|REPLICATION_DETAIL|INNODB_TRX)$") {
continue
}
# 跳过纯数据统计类指标(但保留有阈值监控的指标)
if ($item.Name -match "^(REDIS_UPTIME_DAYS|REDIS_MEMORY_MAX|MYSQL_UPTIME_DAYS|MYSQL_CONNECTIONS_MAX|MYSQL_OPEN_TABLES|MYSQL_TABLE_CACHE|MYSQL_CHARSET|MYSQL_COLLATION)$") {
continue
}
# 去重:跳过已显示的项(基于Name)
if ($seenItems.ContainsKey($item.Name)) {
continue
}
$seenItems[$item.Name] = $true
# 跳过重复的版本信息(优先保留中文或更友好的名称)
if ($item.Name -match "^(REDIS_|MYSQL_|EMQX_|JAVA_|PYTHON_|NGINX_|NACOS_)VERSION$" -and
$seenItems.ContainsKey("版本")) {
continue
}
if ($item.Name -match "版本$") {
$seenItems["VERSION"] = $true
}
# 跳过重复的指标(保留英文键,过滤中文键)
if ($item.Name -match "^(运行天数|Redis键数量|Redis客户端数|MySQL连接数信息|MySQL慢查询数)$") {
continue
}
# 跳过MySQL模块中的Redis相关项
if ($category -eq "MySQL数据库" -and $item.Name -match "Redis") {
continue
}
$displayedItems += $item
}
# 显示过滤后的项
foreach ($item in $displayedItems) {
$reportLines += "+ **$($item.Name)**: $($item.Value) | 阈值: $($item.Threshold) | 状态: $(Get-StatusText -Status $item.Status) | 说明: -"
}
$reportLines += ""
$reportLines += ""
}
# 报告尾部
$reportLines += "---"
$reportLines += ""
$reportLines += "*本报告由服务器健康监测脚本 v4.0 自动生成*"
$reportLines += "_报告生成时间: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')_"
$reportLines += "_服务器健康监测脚本 v4.0 (模块化架构)_"
$reportLines += "_检测模块数量: $($moduleNumber.ToString())_"
return $reportLines -join "`n"
}
# ==================== 获取状态文本 ====================
function Get-StatusText {
    <#
    .SYNOPSIS
        Converts a status keyword into the icon-prefixed label used in the report.
    .PARAMETER Status
        Status keyword; the three recognised values are listed in the table below.
    .OUTPUTS
        System.String. The icon-prefixed label; any unrecognised status yields the
        neutral white-circle label.
    #>
    param(
        [string]$Status
    )
    # Lookup table of the recognised statuses and their rendered labels.
    $labels = @{
        "正常" = "🟢 正常"
        "警告" = "🟡 警告"
        "严重" = "🔴 严重"
    }
    if ($labels.ContainsKey($Status)) {
        return $labels[$Status]
    }
    # Fallback for anything not in the table.
    return "⚪ 正常"
}
# ==================== 获取状态图标 ====================
function Get-StatusIcon {
param(
......
#!/bin/bash
################################################################################
# Docker深度检测模块
# 功能: 深度检测Docker镜像、网络、卷、事件、系统信息等
# 作者: Claude Code
# 日期: 2026-05-09
################################################################################
# Resolve this script's directory and the fixed directory holding the shared libraries.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LIB_DIR="/tmp/check_modules"

# The configuration file must exist before anything else runs; abort otherwise.
if [ ! -f "$LIB_DIR/config.sh" ]; then
  echo "ERROR: 配置文件不存在: $LIB_DIR/config.sh"
  exit 1
fi
source "$LIB_DIR/config.sh"

# Same rule for the common function library.
if [ ! -f "$LIB_DIR/common.sh" ]; then
  echo "ERROR: 通用函数库不存在: $LIB_DIR/common.sh"
  exit 1
fi
source "$LIB_DIR/common.sh"
# ==================== 深度检测函数 ====================
# Report the Docker server version, API version and storage driver.
# Outputs: DOCKER_VERSION always (with a failure marker when the query returns
#          nothing); DOCKER_API_VERSION and DOCKER_STORAGE_DRIVER only when a
#          value was obtained.
check_docker_system_info() {
  local engine_ver api_ver driver

  engine_ver=$(docker version --format '{{.Server.Version}}' 2>/dev/null)
  api_ver=$(docker version --format '{{.Server.APIVersion}}' 2>/dev/null)
  driver=$(docker info --format '{{.Driver}}' 2>/dev/null)

  output_result "DOCKER_VERSION" "${engine_ver:-获取失败}"
  [ -z "$api_ver" ] || output_result "DOCKER_API_VERSION" "$api_ver"
  [ -z "$driver" ] || output_result "DOCKER_STORAGE_DRIVER" "$driver"
}
# Report image totals, dangling-image count and the five largest images.
# Outputs: DOCKER_IMAGES_COUNT, DOCKER_IMAGES_DANGLING, DOCKER_IMAGES_TOP5
#          (a '|'-joined list, or a placeholder when no image is listed).
check_docker_images() {
  local total dangling largest

  total=$(docker images -q 2>/dev/null | wc -l)
  dangling=$(docker images -f "dangling=true" -q 2>/dev/null | wc -l)
  output_result "DOCKER_IMAGES_COUNT" "$total"
  output_result "DOCKER_IMAGES_DANGLING" "$dangling"

  # Sort by the human-readable size column and keep the last (largest) five.
  largest=$(docker images --format "{{.Repository}}:{{.Tag}}\t{{.Size}}" 2>/dev/null \
    | sort -k2 -h | tail -5 | tr '\n' '|' | sed 's/|$//')
  output_result "DOCKER_IMAGES_TOP5" "${largest:-无}"
}
# Report the number of Docker networks, how many use the bridge and overlay
# drivers, and a '|'-joined network list.
# Outputs: DOCKER_NETWORK_COUNT, DOCKER_NETWORK_BRIDGE, DOCKER_NETWORK_OVERLAY,
#          and DOCKER_NETWORK_LIST when at least one network is listed.
check_docker_network_detail() {
  local drivers network_count bridge_count overlay_count network_list

  # One driver name per network, without the table header, so no "minus one"
  # correction is needed and a failed docker call simply yields zero.
  drivers=$(docker network ls --format '{{.Driver}}' 2>/dev/null)

  # grep -c always prints a count (including 0) even though it exits non-zero
  # on zero matches; appending "|| echo 0" would emit a second line and break
  # the value, so the exit status is deliberately ignored here.
  network_count=$(printf '%s\n' "$drivers" | grep -c .)
  bridge_count=$(printf '%s\n' "$drivers" | grep -cx 'bridge')
  overlay_count=$(printf '%s\n' "$drivers" | grep -cx 'overlay')

  output_result "DOCKER_NETWORK_COUNT" "$network_count"
  output_result "DOCKER_NETWORK_BRIDGE" "$bridge_count"
  output_result "DOCKER_NETWORK_OVERLAY" "$overlay_count"

  network_list=$(docker network ls --format "{{.Name}}\t{{.Driver}}\t{{.Scope}}" 2>/dev/null \
    | tr '\n' '|' | sed 's/|$//')
  if [ -n "$network_list" ]; then
    output_result "DOCKER_NETWORK_LIST" "$network_list"
  fi
}
# Report the number of Docker volumes and a '|'-joined volume list.
# Outputs: DOCKER_VOLUMES_COUNT always; DOCKER_VOLUMES_LIST only when the
#          listing is non-empty.
check_docker_volumes() {
  local vol_total vol_names

  vol_total=$(docker volume ls -q 2>/dev/null | wc -l)
  output_result "DOCKER_VOLUMES_COUNT" "$vol_total"

  vol_names=$(docker volume ls --format "{{.Name}}\t{{.Driver}}" 2>/dev/null \
    | tr '\n' '|' | sed 's/|$//')
  [ -z "$vol_names" ] || output_result "DOCKER_VOLUMES_LIST" "$vol_names"
}
# Report the CPU and memory limits configured for every container listed in
# the CONTAINERS associative array (expected to come from the sourced config).
# Globals:  CONTAINERS (read)
# Outputs:  LIMIT_<name>_CPU as a percentage of one CPU, LIMIT_<name>_MEMORY
#           in MB; each falls back to an "unlimited" marker.
check_container_limits() {
  local key container_name
  local cpu_quota cpu_period nano_cpus cpu_limit memory_limit memory_mb

  for key in "${!CONTAINERS[@]}"; do
    container_name="${CONTAINERS[$key]}"

    # Exact, literal match: a container name must not be treated as a regex.
    docker ps -a --format "{{.Names}}" | grep -qxF -- "$container_name" || continue

    cpu_quota=$(docker inspect --format='{{.HostConfig.CpuQuota}}' "$container_name" 2>/dev/null)
    cpu_period=$(docker inspect --format='{{.HostConfig.CpuPeriod}}' "$container_name" 2>/dev/null)
    nano_cpus=$(docker inspect --format='{{.HostConfig.NanoCpus}}' "$container_name" 2>/dev/null)

    if [[ "$cpu_quota" =~ ^[0-9]+$ ]] && (( cpu_quota > 0 )); then
      # CpuPeriod is reported as 0 when only a quota was given; Docker then
      # applies its default CFS period of 100000 microseconds. Using 0 here
      # would be a division by zero.
      if ! [[ "$cpu_period" =~ ^[0-9]+$ ]] || (( cpu_period == 0 )); then
        cpu_period=100000
      fi
      cpu_limit=$(( cpu_quota * 100 / cpu_period ))
      output_result "LIMIT_${container_name}_CPU" "${cpu_limit}%"
    elif [[ "$nano_cpus" =~ ^[0-9]+$ ]] && (( nano_cpus > 0 )); then
      # A limit set with --cpus is stored as NanoCpus (1 CPU = 1e9) and
      # leaves CpuQuota at 0.
      cpu_limit=$(( nano_cpus / 10000000 ))
      output_result "LIMIT_${container_name}_CPU" "${cpu_limit}%"
    else
      output_result "LIMIT_${container_name}_CPU" "无限制"
    fi

    memory_limit=$(docker inspect --format='{{.HostConfig.Memory}}' "$container_name" 2>/dev/null)
    if [[ "$memory_limit" =~ ^[0-9]+$ ]] && (( memory_limit > 0 )); then
      memory_mb=$(( memory_limit / 1024 / 1024 ))
      output_result "LIMIT_${container_name}_MEMORY" "${memory_mb}MB"
    else
      output_result "LIMIT_${container_name}_MEMORY" "无限制"
    fi
  done
}
# Report the number of processes inside the mysql, redis, emqx and java
# containers that are configured in CONTAINERS and currently running.
# Globals:  CONTAINERS (read)
# Outputs:  PROCESSES_<name> for each container with at least one process.
check_container_processes() {
  for key in "mysql" "redis" "emqx" "java"; do
    local container_name="${CONTAINERS[$key]}"

    # Skip roles that are not configured or whose container is not running.
    [ -n "$container_name" ] || continue
    docker ps --format "{{.Names}}" | grep -q "^${container_name}$" || continue

    # "ps aux" prints one header line; subtract it from the line count.
    local ps_lines
    ps_lines=$(docker exec "$container_name" ps aux 2>/dev/null | wc -l)
    local process_count=$((ps_lines - 1))

    (( process_count > 0 )) || continue
    output_result "PROCESSES_${container_name}" "$process_count"
  done
}
# Report the writable-layer size and total root filesystem size of every
# running container listed in CONTAINERS.
# Globals:  CONTAINERS (read)
# Outputs:  FS_<name>_RW and FS_<name>_ROOT in MB, each only when a positive
#           size was obtained.
check_container_fs_usage() {
  local key container_name sizes size_rw size_rootfs rw_mb rootfs_mb

  for key in "${!CONTAINERS[@]}"; do
    container_name="${CONTAINERS[$key]}"
    docker ps --format "{{.Names}}" | grep -qxF -- "$container_name" || continue

    # SizeRw / SizeRootFs are only filled in when inspect is run with --size;
    # without it the template renders "<nil>", which is not a number. Both
    # fields are fetched in one call because computing sizes is expensive.
    sizes=$(docker inspect --size --format='{{.SizeRw}} {{.SizeRootFs}}' "$container_name" 2>/dev/null)
    size_rw=""
    size_rootfs=""
    read -r size_rw size_rootfs <<< "$sizes"

    # Only do arithmetic on values that are really positive integers.
    if [[ "$size_rw" =~ ^[0-9]+$ ]] && (( size_rw > 0 )); then
      rw_mb=$(( size_rw / 1024 / 1024 ))
      output_result "FS_${container_name}_RW" "${rw_mb}MB"
    fi
    if [[ "$size_rootfs" =~ ^[0-9]+$ ]] && (( size_rootfs > 0 )); then
      rootfs_mb=$(( size_rootfs / 1024 / 1024 ))
      output_result "FS_${container_name}_ROOT" "${rootfs_mb}MB"
    fi
  done
}
# Count error-like and warning-like Docker events in the window from one hour
# ago to one minute ago, and derive an overall level from the two counts.
# Outputs: DOCKER_EVENTS_ERROR, DOCKER_EVENTS_WARN, DOCKER_EVENTS_LEVEL.
check_docker_events() {
  local events error_events warn_events

  # Use the default event line format. The previous '{{$line}}' template
  # references an undefined template variable, so docker rejected it and
  # both counters were always 0. The events are fetched once and reused.
  events=$(docker events --since 1h --until 1m 2>/dev/null)

  # grep -c prints the count even when it is 0, so no fallback is needed.
  error_events=$(printf '%s\n' "$events" | grep -ci "error")
  warn_events=$(printf '%s\n' "$events" | grep -cE "warn|kill|die")

  output_result "DOCKER_EVENTS_ERROR" "$error_events"
  output_result "DOCKER_EVENTS_WARN" "$warn_events"

  if [ "$error_events" -gt 0 ]; then
    output_result "DOCKER_EVENTS_LEVEL" "严重"
  elif [ "$warn_events" -gt 0 ]; then
    output_result "DOCKER_EVENTS_LEVEL" "警告"
  else
    output_result "DOCKER_EVENTS_LEVEL" "正常"
  fi
}
# Report the restart count and the State.StartedAt timestamp of every
# container listed in CONTAINERS that exists (running or not).
# Globals:  CONTAINERS (read)
# Outputs:  RESTART_COUNT_<name>, RESTART_TIME_<name>, each only when a value
#           was obtained.
check_container_restart_history() {
  for key in "${!CONTAINERS[@]}"; do
    local container_name="${CONTAINERS[$key]}"
    docker ps -a --format "{{.Names}}" | grep -q "^${container_name}$" || continue

    local restarts started_at

    restarts=$(docker inspect --format='{{.RestartCount}}' "$container_name" 2>/dev/null)
    [ -z "$restarts" ] || output_result "RESTART_COUNT_${container_name}" "$restarts"

    started_at=$(docker inspect --format='{{.State.StartedAt}}' "$container_name" 2>/dev/null)
    [ -z "$started_at" ] || output_result "RESTART_TIME_${container_name}" "$started_at"
  done
}
# Report CPU and memory percentages of the first dockerd process found.
# Outputs: DOCKER_DAEMON_CPU and DOCKER_DAEMON_MEM (each suffixed with '%'),
#          only when a dockerd PID exists and ps returned a value.
check_docker_daemon_resources() {
  local docker_pid
  docker_pid=$(pgrep dockerd | head -1)

  # Nothing to report when no dockerd process is found.
  [ -n "$docker_pid" ] || return 0

  local cpu_pct mem_pct
  cpu_pct=$(ps -p "$docker_pid" -o %cpu --no-headers 2>/dev/null | tr -d ' ')
  mem_pct=$(ps -p "$docker_pid" -o %mem --no-headers 2>/dev/null | tr -d ' ')

  [ -z "$cpu_pct" ] || output_result "DOCKER_DAEMON_CPU" "${cpu_pct}%"
  [ -z "$mem_pct" ] || output_result "DOCKER_DAEMON_MEM" "${mem_pct}%"
}
# Report the number of mounts and a '|'-joined "source:destination" list for
# every running container listed in CONTAINERS.
# Globals:  CONTAINERS (read)
# Outputs:  MOUNTS_<name>_COUNT and MOUNTS_<name>_LIST, only for containers
#           with at least one mount.
check_container_mounts() {
  for key in "${!CONTAINERS[@]}"; do
    local container_name="${CONTAINERS[$key]}"
    docker ps --format "{{.Names}}" | grep -q "^${container_name}$" || continue

    local mount_count
    mount_count=$(docker inspect --format='{{len .Mounts}}' "$container_name" 2>/dev/null)

    # Containers without mounts (or without a usable count) are skipped.
    if [ -z "$mount_count" ] || ! [ "$mount_count" -gt 0 ]; then
      continue
    fi
    output_result "MOUNTS_${container_name}_COUNT" "$mount_count"

    local mounts
    mounts=$(docker inspect --format='{{range .Mounts}}{{.Source}}:{{.Destination}}|{{end}}' "$container_name" 2>/dev/null | sed 's/|$//')
    [ -z "$mounts" ] || output_result "MOUNTS_${container_name}_LIST" "$mounts"
  done
}
# Report how many environment variables are configured on the running mysql
# and redis containers. Only the count is emitted; variable contents are not
# read or inspected here.
# Globals:  CONTAINERS (read)
# Outputs:  ENV_<name>_COUNT when a count was obtained.
check_container_env() {
  for key in "mysql" "redis"; do
    local container_name="${CONTAINERS[$key]}"

    # Skip roles that are not configured or whose container is not running.
    [ -n "$container_name" ] || continue
    docker ps --format "{{.Names}}" | grep -q "^${container_name}$" || continue

    local env_total
    env_total=$(docker inspect --format='{{len .Config.Env}}' "$container_name" 2>/dev/null)
    [ -z "$env_total" ] || output_result "ENV_${container_name}_COUNT" "$env_total"
  done
}
# Report the size of the Docker build cache as shown by "docker system df".
# Outputs: DOCKER_BUILD_CACHE when the "Build Cache" row is present.
check_docker_build_cache() {
  local build_cache

  # "docker system df" has no BuildCache placeholder; each row exposes Type
  # and Size instead, so pick the Size of the row whose Type is "Build Cache".
  build_cache=$(docker system df --format '{{.Type}}|{{.Size}}' 2>/dev/null \
    | awk -F'|' '$1 == "Build Cache" {print $2; exit}')

  if [ -n "$build_cache" ]; then
    output_result "DOCKER_BUILD_CACHE" "$build_cache"
  fi
}
# ==================== 主检测流程 ====================
# Entry point of the Docker deep-check module: logs start/end markers and
# runs every check function in a fixed order.
# Returns: 1 (after logging an error) when the docker command is not
#          available; otherwise the status of the final log call.
main() {
  log_info "开始Docker深度检测..."

  if ! command -v docker &> /dev/null; then
    log_error "Docker未安装"
    return 1
  fi

  # Checks are executed in exactly this order.
  local -a steps=(
    check_docker_system_info
    check_docker_images
    check_docker_network_detail
    check_docker_volumes
    check_container_limits
    check_container_processes
    check_container_fs_usage
    check_docker_events
    check_container_restart_history
    check_docker_daemon_resources
    check_container_mounts
    check_container_env
    check_docker_build_cache
  )
  local step
  for step in "${steps[@]}"; do
    "$step"
  done

  log_info "Docker深度检测完成"
}
# 执行主函数
main
#!/bin/bash
################################################################################
# EMQX深度检测模块
# 功能: 深度检测EMQX连接、订阅、消息统计、集群、性能等
# 作者: Claude Code
# 日期: 2026-05-09
################################################################################
# Resolve this script's directory and the fixed directory holding the shared libraries.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LIB_DIR="/tmp/check_modules"

# The configuration file must exist before anything else runs; abort otherwise.
if [ ! -f "$LIB_DIR/config.sh" ]; then
  echo "ERROR: 配置文件不存在: $LIB_DIR/config.sh"
  exit 1
fi
source "$LIB_DIR/config.sh"

# Same rule for the common function library.
if [ ! -f "$LIB_DIR/common.sh" ]; then
  echo "ERROR: 通用函数库不存在: $LIB_DIR/common.sh"
  exit 1
fi
source "$LIB_DIR/common.sh"

# Name of the EMQX container, taken from the CONTAINERS map
# (presumably defined by the sourced config.sh — verify).
EMQX_CONTAINER="${CONTAINERS[emqx]}"
# ==================== 辅助函数 ====================
# Run a command inside the EMQX container, discarding its stderr.
# Globals:   EMQX_CONTAINER (read)
# Arguments: the command and its arguments, passed through unchanged
# Outputs:   the command's stdout
# Returns:   the exit status of "docker exec"
emqx_exec() {
  # "$@" keeps every argument as one word; an unquoted $@ would re-split
  # arguments containing whitespace and expand glob characters.
  docker exec "$EMQX_CONTAINER" "$@" 2>/dev/null
}
# ==================== 深度检测函数 ====================
# Report the EMQX broker version and node status.
# Outputs: EMQX_BROKER_VERSION when "emqx_ctl broker" prints a version line;
#          EMQX_NODE_STATUS / EMQX_NODE_LEVEL when "emqx_ctl status" prints
#          anything at all.
check_emqx_status_detail() {
  local broker_status node_status version

  broker_status=$(emqx_exec emqx_ctl broker 2>/dev/null)
  node_status=$(emqx_exec emqx_ctl status 2>/dev/null)

  if [ -n "$broker_status" ]; then
    # Take the first line mentioning "version" and keep only its value:
    # everything after the first ':' when there is one, otherwise everything
    # after the first word. The text is printed with "print", never used as a
    # printf format, so a '%' in the value cannot corrupt the output.
    version=$(printf '%s\n' "$broker_status" | awk '
      tolower($0) ~ /version/ {
        line = $0
        if (line ~ /:/) {
          sub(/^[^:]*:/, "", line)
        } else {
          sub(/^[ \t]*[^ \t]+/, "", line)
        }
        gsub(/^[ \t]+|[ \t]+$/, "", line)
        print line
        exit
      }')
    if [ -n "$version" ]; then
      output_result "EMQX_BROKER_VERSION" "$version"
    fi
  fi

  # Any output from the status command is treated as "running"; the content
  # itself is not parsed here.
  if [ -n "$node_status" ]; then
    output_result "EMQX_NODE_STATUS" "运行中"
    output_result "EMQX_NODE_LEVEL" "正常"
  fi
}
# Report EMQX client totals and the number of running listeners.
# Outputs: EMQX_CLIENTS_TOTAL, EMQX_CLIENTS_CONNECTED,
#          EMQX_CLIENTS_DISCONNECTED, EMQX_MAX_CLIENTS.
check_emqx_clients_detail() {
  local clients client_count clients_connected clients_disconnected max_clients

  # Query the client list once and derive every figure from the same snapshot.
  clients=$(emqx_exec emqx_ctl clients list 2>/dev/null)

  # grep -c prints a count even when it is 0 (while exiting non-zero), so a
  # "|| echo 0" fallback would append a second line and break the arithmetic
  # below; the exit status is therefore ignored.
  client_count=$(printf '%s' "$clients" | grep -c '')
  output_result "EMQX_CLIENTS_TOTAL" "$client_count"

  # NOTE(review): "connected" also matches substrings such as "disconnected"
  # or a "connected_at" field if the listing contains them — confirm the
  # pattern against the real emqx_ctl output format.
  clients_connected=$(printf '%s' "$clients" | grep -c "connected")
  output_result "EMQX_CLIENTS_CONNECTED" "$clients_connected"
  clients_disconnected=$(( client_count - clients_connected ))
  output_result "EMQX_CLIENTS_DISCONNECTED" "$clients_disconnected"

  # Reported under EMQX_MAX_CLIENTS, but the value is the number of listener
  # lines containing "running".
  max_clients=$(emqx_exec emqx_ctl listeners list 2>/dev/null | grep -c "running")
  output_result "EMQX_MAX_CLIENTS" "$max_clients"
}
# Report the number of EMQX subscriptions and the five most frequent values
# found in the second column of the subscription listing.
# Outputs: EMQX_SUBSCRIPTIONS_TOTAL; EMQX_TOP_TOPICS ('|'-joined
#          "count value" pairs) when the listing is non-empty.
check_emqx_subscriptions_detail() {
  local total_subs busiest

  total_subs=$(emqx_exec emqx_ctl subscriptions list 2>/dev/null | wc -l)
  [ -z "$total_subs" ] || output_result "EMQX_SUBSCRIPTIONS_TOTAL" "$total_subs"

  # Frequency table of column 2, most frequent first, top five entries.
  busiest=$(emqx_exec emqx_ctl subscriptions list 2>/dev/null \
    | awk '{print $2}' | sort | uniq -c | sort -rn | head -5 \
    | tr '\n' '|' | sed 's/|$//')
  [ -z "$busiest" ] || output_result "EMQX_TOP_TOPICS" "$busiest"
}
# Report EMQX sent/received message counters and a one-line summary of all
# byte and message metrics.
# Outputs: EMQX_MESSAGES_SENT, EMQX_MESSAGES_RECEIVED (when the respective
#          metric line exists) and EMQX_METRICS (space-joined metric lines).
check_emqx_messages_stats() {
  local metrics messages_sent messages_received messages_rate

  # Query the metrics once and reuse the snapshot for every figure.
  metrics=$(emqx_exec emqx_ctl metrics 2>/dev/null)

  # Match the metric name exactly in column 1 and take the last column as the
  # value, so a separator column between name and value (e.g. ':') is never
  # reported as the value.
  messages_sent=$(printf '%s\n' "$metrics" | awk '$1 == "messages.sent" {print $NF; exit}')
  messages_received=$(printf '%s\n' "$metrics" | awk '$1 == "messages.received" {print $NF; exit}')

  if [ -n "$messages_sent" ]; then
    output_result "EMQX_MESSAGES_SENT" "$messages_sent"
  fi
  if [ -n "$messages_received" ]; then
    output_result "EMQX_MESSAGES_RECEIVED" "$messages_received"
  fi

  # In an extended regex the alternation operator is a bare '|'; the escaped
  # form '\|' matches a literal pipe character and therefore never matched.
  messages_rate=$(printf '%s\n' "$metrics" | grep -E "bytes|messages")
  if [ -n "$messages_rate" ]; then
    output_result "EMQX_METRICS" "$(echo "$messages_rate" | tr '\n' ' ' | sed 's/ $//')"
  fi
}
# Report the number of lines in the EMQX topic listing and its first entries.
# Outputs: EMQX_TOPICS_TOTAL; EMQX_TOP5_TOPICS (the first six lines of the
#          listing, '|'-joined) when the listing is non-empty.
check_emqx_topics_detail() {
  local total_topics leading_topics

  total_topics=$(emqx_exec emqx_ctl topics list 2>/dev/null | wc -l)
  [ -z "$total_topics" ] || output_result "EMQX_TOPICS_TOTAL" "$total_topics"

  leading_topics=$(emqx_exec emqx_ctl topics list 2>/dev/null \
    | head -6 | tr '\n' '|' | sed 's/|$//')
  [ -z "$leading_topics" ] || output_result "EMQX_TOP5_TOPICS" "$leading_topics"
}
# Report the number of lines in the EMQX route listing.
# Outputs: EMQX_ROUTES_TOTAL.
check_emqx_routes_detail() {
  local total_routes
  total_routes=$(emqx_exec emqx_ctl routes list 2>/dev/null | wc -l)
  [ -z "$total_routes" ] || output_result "EMQX_ROUTES_TOTAL" "$total_routes"
}
# Report how many EMQX listener lines are marked "running" and the fourth
# column of each such line.
# Outputs: EMQX_LISTENERS_COUNT when the listener listing is non-empty;
#          EMQX_LISTENER_PORTS (comma-joined) when any running line exists.
check_emqx_listeners_detail() {
  local listeners listener_count listener_ports

  listeners=$(emqx_exec emqx_ctl listeners list 2>/dev/null)
  [ -n "$listeners" ] || return 0

  # grep -c already prints "0" when nothing matches (while exiting non-zero);
  # a "|| echo 0" fallback would turn the value into two lines.
  listener_count=$(printf '%s\n' "$listeners" | grep -c "running")
  output_result "EMQX_LISTENERS_COUNT" "$listener_count"

  # NOTE(review): column 4 is assumed to hold the port/bind address — confirm
  # against the listener listing format of the deployed EMQX version.
  listener_ports=$(printf '%s\n' "$listeners" | awk '/running/ {print $4}' | tr '\n' ',' | sed 's/,$//')
  if [ -n "$listener_ports" ]; then
    output_result "EMQX_LISTENER_PORTS" "$listener_ports"
  fi
}
# Check EMQX cluster status.
#
# Reads `emqx_ctl cluster status` and reports:
#   EMQX_CLUSTER_MODE   "是" when the output contains "Cluster", otherwise "否"
#   EMQX_CLUSTER_NODES  number of lines containing "Node" in cluster mode,
#                       otherwise "1"
# Nothing is reported when the command produces no output.
# NOTE(review): both decisions rely on the literal words "Cluster" and "Node"
# appearing in the command output — confirm against the deployed EMQX version.
check_emqx_cluster_detail() {
  local cluster_status
  cluster_status=$(emqx_exec emqx_ctl cluster status 2>/dev/null)
  if [ -z "$cluster_status" ]; then
    return 0
  fi

  # grep -q gives a clean yes/no; counting with `grep -c ... || echo 0`
  # produced the two-line value "0\n0" and broke the numeric comparison.
  if echo "$cluster_status" | grep -q "Cluster"; then
    output_result "EMQX_CLUSTER_MODE" "是"
    local node_count
    node_count=$(echo "$cluster_status" | grep -c "Node") || node_count=0
    output_result "EMQX_CLUSTER_NODES" "$node_count"
  else
    output_result "EMQX_CLUSTER_MODE" "否"
    output_result "EMQX_CLUSTER_NODES" "1"
  fi
}
# Check EMQX connection limits.
#
# Reports:
#   EMQX_MAX_CONNECTIONS  last field of the first "max_connections" line in
#                         `emqx_ctl listeners list` (reported verbatim)
#   EMQX_CONN_USAGE       line count of `emqx_ctl clients list` as a percentage
#                         of that limit, one decimal place
#   EMQX_CONN_LEVEL       "严重" above 90%, "警告" above 80%, otherwise "正常"
# The usage and level are only reported when the limit is a positive integer;
# values such as "infinity" or "0" cannot be used as a divisor.
check_emqx_connection_limits() {
  local max_connections
  max_connections=$(emqx_exec emqx_ctl listeners list 2>/dev/null \
    | grep "max_connections" | awk '{print $NF}' | head -1)
  if [ -n "$max_connections" ]; then
    output_result "EMQX_MAX_CONNECTIONS" "$max_connections"
  fi

  # Skip the ratio unless the limit consists of digits only ...
  case "$max_connections" in
    '' | *[!0-9]*) return 0 ;;
  esac
  # ... and is not made up solely of zeros.
  if [ -z "${max_connections//0/}" ]; then
    return 0
  fi

  local current_connections
  current_connections=$(emqx_exec emqx_ctl clients list 2>/dev/null | wc -l)

  # Values are handed to awk as data (-v) rather than spliced into the program.
  local conn_percent
  conn_percent=$(awk -v cur="$current_connections" -v max="$max_connections" \
    'BEGIN { printf "%.1f", (cur * 100) / max }')
  output_result "EMQX_CONN_USAGE" "${conn_percent}%"

  local level
  level=$(awk -v p="$conn_percent" 'BEGIN {
    p += 0
    if (p > 90) print "严重"
    else if (p > 80) print "警告"
    else print "正常"
  }')
  output_result "EMQX_CONN_LEVEL" "${level:-正常}"
}
# Check EMQX memory usage.
#
# Looks up the first "beam.smp" process in the `ps aux` listing of the
# $EMQX_CONTAINER container, then asks `ps` inside the container for that
# process's %mem and reports it as EMQX_MEMORY_USAGE (with a "%" suffix).
# Nothing is reported when the process or its %mem value cannot be found.
check_emqx_memory() {
  # PID column of the first beam.smp line, ignoring the grep helper itself
  local beam_pid
  beam_pid=$(docker exec "$EMQX_CONTAINER" ps aux | awk '/beam.smp/ && !/grep/ {print $2}' | head -1)
  [ -n "$beam_pid" ] || return 0

  local mem_pct
  mem_pct=$(docker exec "$EMQX_CONTAINER" ps -p "$beam_pid" -o %mem --no-headers 2>/dev/null | tr -d ' ')
  [ -n "$mem_pct" ] || return 0

  output_result "EMQX_MEMORY_USAGE" "${mem_pct}%"
}
# Check EMQX session details.
#
# Reads `emqx_ctl sessions list` once and reports:
#   EMQX_SESSIONS_TOTAL     number of lines in the listing
#   EMQX_SESSIONS_ACTIVE    number of lines containing "active"
#   EMQX_SESSIONS_INACTIVE  total minus active
# NOTE(review): the pattern "active" also matches the word "inactive" —
# confirm the listing format before relying on the active/inactive split.
check_emqx_sessions_detail() {
  local sessions
  sessions=$(emqx_exec emqx_ctl sessions list 2>/dev/null) || true

  local session_count=0
  local sessions_active=0
  if [ -n "$sessions" ]; then
    session_count=$(printf '%s\n' "$sessions" | wc -l)
    # grep -c prints "0" itself when nothing matches (and exits non-zero), so
    # the fallback assigns rather than echoing a second "0" into the value.
    sessions_active=$(printf '%s\n' "$sessions" | grep -c "active") || sessions_active=0
  fi

  local sessions_inactive=$((session_count - sessions_active))
  output_result "EMQX_SESSIONS_TOTAL" "$session_count"
  output_result "EMQX_SESSIONS_ACTIVE" "$sessions_active"
  output_result "EMQX_SESSIONS_INACTIVE" "$sessions_inactive"
}
# Check EMQX plugin status.
#
# Reads `emqx_ctl plugins list` once and reports:
#   EMQX_PLUGINS_LOADED  first field of every line containing "loaded",
#                        comma-separated (omitted when there is none)
#   EMQX_PLUGINS_COUNT   number of lines containing "loaded"
# NOTE(review): the pattern "loaded" also matches "unloaded" / "loaded=false" —
# confirm the listing format of the deployed EMQX version.
check_emqx_plugins() {
  local plugin_list
  plugin_list=$(emqx_exec emqx_ctl plugins list 2>/dev/null) || true

  local plugins
  plugins=$(printf '%s\n' "$plugin_list" | awk '/loaded/ {print $1}' | tr '\n' ',' | sed 's/,$//')
  if [ -n "$plugins" ]; then
    output_result "EMQX_PLUGINS_LOADED" "$plugins"
  fi

  # grep -c prints "0" on its own when nothing matches (exit status 1), so the
  # fallback assigns instead of echoing and producing the value "0\n0".
  local plugin_count
  plugin_count=$(printf '%s\n' "$plugin_list" | grep -c "loaded") || plugin_count=0
  output_result "EMQX_PLUGINS_COUNT" "$plugin_count"
}
# Check the EMQX rule engine.
#
# Reports the number of lines printed by `emqx_ctl rules list` as
# EMQX_RULES_COUNT, or "未启用" when the listing is empty.
check_emqx_rules() {
  local listed_rules
  listed_rules=$(emqx_exec emqx_ctl rules list 2>/dev/null | wc -l)

  # An empty count is treated like zero.
  if [ "${listed_rules:-0}" -gt 0 ]; then
    output_result "EMQX_RULES_COUNT" "$listed_rules"
    return
  fi
  output_result "EMQX_RULES_COUNT" "未启用"
}
# Check EMQX alarms.
#
# Reads `emqx_ctl alarms list`. When it prints anything, every output line is
# counted as an alarm and the function reports EMQX_ALARMS_COUNT,
# EMQX_ALARMS_LEVEL="警告" and EMQX_ALARMS_DETAIL (first five lines joined
# with "|"). Otherwise it reports a count of "0" and level "正常".
check_emqx_alarms() {
  local alarm_lines
  alarm_lines=$(emqx_exec emqx_ctl alarms list 2>/dev/null)

  # No output at all means no alarms.
  local total="0"
  if [ -n "$alarm_lines" ]; then
    total=$(echo "$alarm_lines" | wc -l)
  fi

  if [ "$total" -gt 0 ]; then
    output_result "EMQX_ALARMS_COUNT" "$total"
    output_result "EMQX_ALARMS_LEVEL" "警告"
    # Up to five alarm lines as a single "|"-separated value
    local summary
    summary=$(echo "$alarm_lines" | head -5 | tr '\n' '|' | sed 's/|$//')
    output_result "EMQX_ALARMS_DETAIL" "$summary"
  else
    output_result "EMQX_ALARMS_COUNT" "0"
    output_result "EMQX_ALARMS_LEVEL" "正常"
  fi
}
# Check EMQX packet-level performance counters.
#
# Reads `emqx_ctl metrics` and reports, for each metric that is present:
#   EMQX_PACKETS_CONN  "packets.connect" (or "packets.connect.received")
#   EMQX_PACKETS_RECV  "packets.received"
#   EMQX_PACKETS_SENT  "packets.sent"
# Nothing is reported when the metrics command produces no output.
check_emqx_performance() {
  local metrics
  metrics=$(emqx_exec emqx_ctl metrics 2>/dev/null)
  if [ -z "$metrics" ]; then
    return 0
  fi

  # The value is taken from the last field, so "name value" and
  # "name : value" lines both work; a colon glued to the name is dropped
  # before the name is compared. Only the first matching line is used.
  local conn_pkt recv_pkt send_pkt
  conn_pkt=$(printf '%s\n' "$metrics" \
    | awk '{ k = $1; sub(/:$/, "", k) } k ~ /^packets\.connect(\.received)?$/ { print $NF; exit }')
  recv_pkt=$(printf '%s\n' "$metrics" \
    | awk '{ k = $1; sub(/:$/, "", k) } k == "packets.received" { print $NF; exit }')
  send_pkt=$(printf '%s\n' "$metrics" \
    | awk '{ k = $1; sub(/:$/, "", k) } k == "packets.sent" { print $NF; exit }')

  if [ -n "$conn_pkt" ]; then
    output_result "EMQX_PACKETS_CONN" "$conn_pkt"
  fi
  if [ -n "$recv_pkt" ]; then
    output_result "EMQX_PACKETS_RECV" "$recv_pkt"
  fi
  if [ -n "$send_pkt" ]; then
    output_result "EMQX_PACKETS_SENT" "$send_pkt"
  fi
}
# ==================== Main check flow ====================
# Entry point of the EMQX deep check.
#
# Reports EMQX_CONTAINER_STATUS ("未运行" and return 1 when no running container
# is named exactly $EMQX_CONTAINER, otherwise "运行中") and then runs every
# check_emqx_* step in order.
main() {
  log_info "开始EMQX深度检测..."

  # Fixed-string, whole-line match: the container name is compared literally,
  # so characters such as "." in the name are not treated as regex operators.
  if ! docker ps --format "{{.Names}}" | grep -Fxq -- "${EMQX_CONTAINER}"; then
    log_warn "EMQX容器未运行"
    output_result "EMQX_CONTAINER_STATUS" "未运行"
    return 1
  fi
  output_result "EMQX_CONTAINER_STATUS" "运行中"

  # Each step is best-effort: its stderr is discarded and a failure does not
  # stop the remaining steps.
  local check
  for check in \
    check_emqx_status_detail \
    check_emqx_clients_detail \
    check_emqx_subscriptions_detail \
    check_emqx_messages_stats \
    check_emqx_topics_detail \
    check_emqx_routes_detail \
    check_emqx_listeners_detail \
    check_emqx_cluster_detail \
    check_emqx_connection_limits \
    check_emqx_memory \
    check_emqx_sessions_detail \
    check_emqx_plugins \
    check_emqx_rules \
    check_emqx_alarms \
    check_emqx_performance; do
    "$check" 2>/dev/null || true
  done

  log_info "EMQX深度检测完成"
}
# Run the main function
main
#!/bin/bash
################################################################################
# Comprehensive diagnosis module
# Purpose: aggregate the anomalies found by the other checks, diagnose core
#          problems and compare against historical data
# Author: Claude Code
# Date: 2026-05-09
################################################################################
# Directory of this script, and the directory the dependencies are loaded from
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
LIB_DIR="/tmp/check_modules"

# The configuration file is mandatory: abort when it is missing.
if [ ! -f "$LIB_DIR/config.sh" ]; then
  echo "ERROR: 配置文件不存在: $LIB_DIR/config.sh"
  exit 1
fi
source "$LIB_DIR/config.sh"

# So is the shared function library.
if [ ! -f "$LIB_DIR/common.sh" ]; then
  echo "ERROR: 通用函数库不存在: $LIB_DIR/common.sh"
  exit 1
fi
source "$LIB_DIR/common.sh"

# Where history snapshots are stored, and the data file of the current run
HISTORY_DIR="$LIB_DIR/history"
CURRENT_DATA_FILE="$LIB_DIR/current_data.txt"
# ==================== Helper functions ====================
# Print the value recorded for a key in a "key:value" data file.
#
# Arguments: $1 - key (used as the start of a grep pattern, followed by ":")
#            $2 - data file
# Outputs:   for every matching line, everything after the first ":" with all
#            spaces removed; nothing when the file does not exist
extract_metric() {
  local metric_key=$1
  local data_path=$2

  # A missing file is not an error: just produce no output.
  [ -f "$data_path" ] || return 0

  grep "^${metric_key}:" "$data_path" 2>/dev/null | cut -d: -f2- | tr -d ' '
}
# ==================== Core problem diagnosis ====================
# Diagnose the system load.
#
# Reads "1分钟负载" from $CURRENT_DATA_FILE and, when present, prints an
# ERROR:/WARN: line if it exceeds 16 / 8, then reports DIAG_LOAD_1MIN (the raw
# value) and DIAG_LOAD_LEVEL ("严重", "警告" or "正常").
diagnose_system_load() {
  local load1=$(extract_metric "1分钟负载" "$CURRENT_DATA_FILE")
  if [ -z "$load1" ]; then
    return 0
  fi

  # Only the first field of the first line is evaluated.
  local load_value=$(printf '%s\n' "$load1" | awk 'NR == 1 { print $1 }')

  # The value is passed to awk as data (-v) and coerced with "+= 0": text that
  # is not a number counts as 0 and can never be executed as awk code.
  local level=$(awk -v v="$load_value" 'BEGIN {
    v += 0
    if (v > 16) print "严重"
    else if (v > 8) print "警告"
    else print "正常"
  }')
  level=${level:-正常}

  case "$level" in
    严重) echo "ERROR:系统负载过高: ${load1}" ;;
    警告) echo "WARN:系统负载较高: ${load1}" ;;
  esac
  output_result "DIAG_LOAD_1MIN" "$load1"
  output_result "DIAG_LOAD_LEVEL" "$level"
}
# Diagnose memory pressure.
#
# Reads "内存使用率" from $CURRENT_DATA_FILE and, when present, prints an
# ERROR:/WARN: line if it exceeds 95 / 85 percent, then reports
# DIAG_MEMORY_USAGE (the raw value) and DIAG_MEMORY_LEVEL.
diagnose_memory_pressure() {
  local mem_usage=$(extract_metric "内存使用率" "$CURRENT_DATA_FILE")
  if [ -z "$mem_usage" ]; then
    return 0
  fi

  # First field of the first line, without the percent sign.
  local mem_value=$(printf '%s\n' "$mem_usage" | tr -d '%' | awk 'NR == 1 { print $1 }')

  # The value is passed to awk as data (-v) and coerced with "+= 0": text that
  # is not a number counts as 0 and can never be executed as awk code.
  local level=$(awk -v v="$mem_value" 'BEGIN {
    v += 0
    if (v > 95) print "严重"
    else if (v > 85) print "警告"
    else print "正常"
  }')
  level=${level:-正常}

  case "$level" in
    严重) echo "ERROR:内存使用率过高: ${mem_usage}" ;;
    警告) echo "WARN:内存使用率较高: ${mem_usage}" ;;
  esac
  output_result "DIAG_MEMORY_USAGE" "$mem_usage"
  output_result "DIAG_MEMORY_LEVEL" "$level"
}
# Diagnose swap usage.
#
# Reads "SWAP_USAGE" from $CURRENT_DATA_FILE and, when present, prints an
# ERROR: line above 20 percent or a WARN: line for any usage above 0, then
# reports DIAG_SWAP_USAGE (the raw value) and DIAG_SWAP_LEVEL.
diagnose_swap_usage() {
  local swap_usage=$(extract_metric "SWAP_USAGE" "$CURRENT_DATA_FILE")
  if [ -z "$swap_usage" ]; then
    return 0
  fi

  # First field of the first line, without the percent sign.
  local swap_value=$(printf '%s\n' "$swap_usage" | tr -d '%' | awk 'NR == 1 { print $1 }')

  # The value is passed to awk as data (-v) and coerced with "+= 0": text that
  # is not a number counts as 0 and can never be executed as awk code.
  local level=$(awk -v v="$swap_value" 'BEGIN {
    v += 0
    if (v > 20) print "严重"
    else if (v > 0) print "警告"
    else print "正常"
  }')
  level=${level:-正常}

  case "$level" in
    严重) echo "ERROR:Swap使用率过高: ${swap_usage}" ;;
    警告) echo "WARN:存在Swap使用: ${swap_usage}" ;;
  esac
  output_result "DIAG_SWAP_USAGE" "$swap_usage"
  output_result "DIAG_SWAP_LEVEL" "$level"
}
# Diagnose disk I/O.
#
# Reads "DISK_IO_STATUS" from $CURRENT_DATA_FILE. When present, reports it as
# DIAG_DISK_IO and sets DIAG_DISK_LEVEL to "正常" for the states "正常" and
# "未安装iostat"; any other state yields "警告" plus a WARN: line.
diagnose_disk_io() {
  local io_state=$(extract_metric "DISK_IO_STATUS" "$CURRENT_DATA_FILE")
  [ -n "$io_state" ] || return 0

  output_result "DIAG_DISK_IO" "$io_state"
  case "$io_state" in
    "正常" | "未安装iostat")
      output_result "DIAG_DISK_LEVEL" "正常"
      ;;
    *)
      output_result "DIAG_DISK_LEVEL" "警告"
      echo "WARN:磁盘IO异常: ${io_state}"
      ;;
  esac
}
# Diagnose MySQL slow queries.
#
# Reads "MYSQL_SLOW_QUERIES" from $CURRENT_DATA_FILE and, when present, prints
# an ERROR:/WARN: line if it exceeds 1000000 / 100, then reports
# DIAG_MYSQL_SLOW (the raw value) and DIAG_MYSQL_SLOW_LEVEL.
diagnose_mysql_slow() {
  local slow_queries=$(extract_metric "MYSQL_SLOW_QUERIES" "$CURRENT_DATA_FILE")
  if [ -z "$slow_queries" ]; then
    return 0
  fi

  # Only the first field of the first line is evaluated.
  local slow_value=$(printf '%s\n' "$slow_queries" | awk 'NR == 1 { print $1 }')

  # The value is passed to awk as data (-v) and coerced with "+= 0": text that
  # is not a number counts as 0 and can never be executed as awk code.
  local level=$(awk -v v="$slow_value" 'BEGIN {
    v += 0
    if (v > 1000000) print "严重"
    else if (v > 100) print "警告"
    else print "正常"
  }')
  level=${level:-正常}

  case "$level" in
    严重) echo "ERROR:MySQL慢查询过多: ${slow_queries}" ;;
    警告) echo "WARN:MySQL慢查询较多: ${slow_queries}" ;;
  esac
  output_result "DIAG_MYSQL_SLOW" "$slow_queries"
  output_result "DIAG_MYSQL_SLOW_LEVEL" "$level"
}
# Diagnose zombie processes.
#
# Reads "ZOMBIE_COUNT" from $CURRENT_DATA_FILE and, when present, prints an
# ERROR: line above 5 or a WARN: line for any count above 0, then reports
# DIAG_ZOMBIE (the raw value) and DIAG_ZOMBIE_LEVEL.
diagnose_zombie() {
  local zombie_count=$(extract_metric "ZOMBIE_COUNT" "$CURRENT_DATA_FILE")
  if [ -z "$zombie_count" ]; then
    return 0
  fi

  # Only the first field of the first line is evaluated.
  local zombie_value=$(printf '%s\n' "$zombie_count" | awk 'NR == 1 { print $1 }')

  # The value is passed to awk as data (-v) and coerced with "+= 0": text that
  # is not a number counts as 0 and can never be executed as awk code.
  local level=$(awk -v v="$zombie_value" 'BEGIN {
    v += 0
    if (v > 5) print "严重"
    else if (v > 0) print "警告"
    else print "正常"
  }')
  level=${level:-正常}

  case "$level" in
    严重) echo "ERROR:僵尸进程过多: ${zombie_count}" ;;
    警告) echo "WARN:存在僵尸进程: ${zombie_count}" ;;
  esac
  output_result "DIAG_ZOMBIE" "$zombie_count"
  output_result "DIAG_ZOMBIE_LEVEL" "$level"
}
# Diagnose TIME_WAIT connections.
#
# Reads "TCP_TIME_WAIT" from $CURRENT_DATA_FILE and, when present, prints an
# ERROR:/WARN: line if it exceeds 5000 / 500, then reports DIAG_TIMEWAIT (the
# raw value) and DIAG_TIMEWAIT_LEVEL.
diagnose_timewait() {
  local timewait=$(extract_metric "TCP_TIME_WAIT" "$CURRENT_DATA_FILE")
  if [ -z "$timewait" ]; then
    return 0
  fi

  # Only the first field of the first line is evaluated.
  local timewait_value=$(printf '%s\n' "$timewait" | awk 'NR == 1 { print $1 }')

  # The value is passed to awk as data (-v) and coerced with "+= 0": text that
  # is not a number counts as 0 and can never be executed as awk code.
  local level=$(awk -v v="$timewait_value" 'BEGIN {
    v += 0
    if (v > 5000) print "严重"
    else if (v > 500) print "警告"
    else print "正常"
  }')
  level=${level:-正常}

  case "$level" in
    严重) echo "ERROR:TIME_WAIT连接过多: ${timewait}" ;;
    警告) echo "WARN:TIME_WAIT连接较多: ${timewait}" ;;
  esac
  output_result "DIAG_TIMEWAIT" "$timewait"
  output_result "DIAG_TIMEWAIT_LEVEL" "$level"
}
# ==================== History comparison ====================
# Save the current data file as a history snapshot.
#
# Copies $CURRENT_DATA_FILE to $HISTORY_DIR/<epoch-seconds>.data, reports
# HISTORY_SAVED with that timestamp and then keeps only the ten most recently
# modified snapshots.
# Returns: 0 when the snapshot was stored or there is no current data file,
#          1 when the copy failed (nothing is reported as saved in that case).
save_current_data() {
  local timestamp
  timestamp=$(date +%s)
  local data_file="$HISTORY_DIR/${timestamp}.data"
  mkdir -p "$HISTORY_DIR" 2>/dev/null

  if [ ! -f "$CURRENT_DATA_FILE" ]; then
    return 0
  fi

  # Only announce the snapshot once it really exists.
  if ! cp "$CURRENT_DATA_FILE" "$data_file" 2>/dev/null; then
    log_warn "历史数据保存失败: $data_file"
    return 1
  fi
  output_result "HISTORY_SAVED" "$timestamp"

  # Newest first; everything after the tenth entry is removed. The names are
  # read one per line into an array so each path stays a single argument.
  local -a snapshots=()
  mapfile -t snapshots < <(ls -t -- "$HISTORY_DIR"/*.data 2>/dev/null)
  if [ "${#snapshots[@]}" -gt 10 ]; then
    rm -f -- "${snapshots[@]:10}" 2>/dev/null
  fi
  return 0
}
# Print the path of the previous run's snapshot.
#
# Snapshots in $HISTORY_DIR are ordered by modification time and the second
# newest is printed: main() stores the current run's snapshot before any
# comparison is made, so the newest file is the current run itself.
# Outputs: the snapshot path, or an empty line when fewer than two snapshots
#          exist (i.e. there is no earlier run to compare with).
load_last_data() {
  # `sed -n 2p` yields nothing for a single snapshot, whereas
  # `head -2 | tail -1` would fall back to that one (current) file.
  local last_file=$(ls -t -- "$HISTORY_DIR"/*.data 2>/dev/null | sed -n '2p')
  if [ -n "$last_file" ] && [ -f "$last_file" ]; then
    echo "$last_file"
  else
    echo ""
  fi
}
# Describe how a metric changed between two runs.
#
# Arguments: $1 - current value (may carry a unit, e.g. "45.2%")
#            $2 - previous value
#            $3 - metric type: "percent", anything else is treated as a count
# Outputs:   "<change>|<trend>", or "N/A" when either value is empty or
#            contains no number.
#   percent: change has one decimal and a "%" suffix; the trend is only
#            "🔴 上升" / "🟢 下降" when the change exceeds 5 points, else "持平"
#   count:   change is a whole number; the trend is "🔴 增加" / "🟢 减少" for
#            any non-zero change, else "持平"
calculate_change() {
  local current=$1
  local last=$2
  local metric_type=$3
  if [ -z "$current" ] || [ -z "$last" ]; then
    echo "N/A"
    return
  fi

  # First unsigned number found in each value
  local current_val=$(echo "$current" | grep -oE '[0-9]+\.?[0-9]*' | head -1)
  local last_val=$(echo "$last" | grep -oE '[0-9]+\.?[0-9]*' | head -1)
  if [ -z "$current_val" ] || [ -z "$last_val" ]; then
    echo "N/A"
    return
  fi

  # Difference and trend are computed in one awk call.
  local result=$(awk -v curr="$current_val" -v last="$last_val" -v type="$metric_type" 'BEGIN {
    change = curr - last
    trend = "持平"
    if (type == "percent") {
      abs_change = (change < 0 ? -change : change)
      if (abs_change > 5) {
        trend = (change > 0 ? "🔴 上升" : "🟢 下降")
      }
      printf "%.1f%%|%s", change, trend
    } else {
      if (change > 0) {
        trend = "🔴 增加"
      } else if (change < 0) {
        trend = "🟢 减少"
      }
      # A count difference is an absolute number, so no "%" suffix here.
      printf "%.0f|%s", change, trend
    }
  }')
  echo "$result"
}
# Compare CPU usage with the previous run.
#
# Reports COMPARE_CPU: a first-run notice when there is no previous snapshot,
# otherwise the previous value, the current value and the change of
# "CPU使用率" (percent type). Nothing is reported when either value is missing.
compare_cpu() {
  local metric_key="CPU使用率"
  local previous_file=$(load_last_data)

  if [ -n "$previous_file" ]; then
    local now_value=$(extract_metric "$metric_key" "$CURRENT_DATA_FILE")
    local prev_value=$(extract_metric "$metric_key" "$previous_file")
    if [ -z "$now_value" ] || [ -z "$prev_value" ]; then
      return 0
    fi
    local delta=$(calculate_change "$now_value" "$prev_value" "percent")
    output_result "COMPARE_CPU" "上次: $prev_value | 本次: $now_value | 变化: $delta"
  else
    output_result "COMPARE_CPU" "首次检测,无历史数据"
  fi
}
# Compare memory usage with the previous run.
#
# Reports COMPARE_MEMORY: a first-run notice when there is no previous
# snapshot, otherwise the previous value, the current value and the change of
# "内存使用率" (percent type). Nothing is reported when either value is missing.
compare_memory() {
  local previous_file=$(load_last_data)
  case "$previous_file" in
    "")
      output_result "COMPARE_MEMORY" "首次检测,无历史数据"
      return
      ;;
  esac

  local now_value=$(extract_metric "内存使用率" "$CURRENT_DATA_FILE")
  local prev_value=$(extract_metric "内存使用率" "$previous_file")
  # Both sides are needed for a comparison.
  if [ -z "$now_value" ] || [ -z "$prev_value" ]; then
    return 0
  fi

  local delta=$(calculate_change "$now_value" "$prev_value" "percent")
  output_result "COMPARE_MEMORY" "上次: $prev_value | 本次: $now_value | 变化: $delta"
}
# Compare root disk usage with the previous run.
#
# Reports COMPARE_DISK: a first-run notice when there is no previous snapshot,
# otherwise the previous value, the current value and the change of
# "DISK_USAGE_ROOT" (percent type). Nothing is reported when either value is
# missing.
compare_disk() {
  local metric_key="DISK_USAGE_ROOT"
  local previous_file=$(load_last_data)

  if [ -n "$previous_file" ]; then
    local now_value=$(extract_metric "$metric_key" "$CURRENT_DATA_FILE")
    local prev_value=$(extract_metric "$metric_key" "$previous_file")
    if [ -z "$now_value" ] || [ -z "$prev_value" ]; then
      return 0
    fi
    local delta=$(calculate_change "$now_value" "$prev_value" "percent")
    output_result "COMPARE_DISK" "上次: $prev_value | 本次: $now_value | 变化: $delta"
  else
    output_result "COMPARE_DISK" "首次检测,无历史数据"
  fi
}
# Compare the total thread count with the previous run.
#
# Reports COMPARE_THREADS: a first-run notice when there is no previous
# snapshot, otherwise the previous value, the current value and the change of
# "THREAD_COUNT" (count type). Nothing is reported when either value is missing.
compare_threads() {
  local previous_file=$(load_last_data)
  case "$previous_file" in
    "")
      output_result "COMPARE_THREADS" "首次检测,无历史数据"
      return
      ;;
  esac

  local now_value=$(extract_metric "THREAD_COUNT" "$CURRENT_DATA_FILE")
  local prev_value=$(extract_metric "THREAD_COUNT" "$previous_file")
  # Both sides are needed for a comparison.
  if [ -z "$now_value" ] || [ -z "$prev_value" ]; then
    return 0
  fi

  local delta=$(calculate_change "$now_value" "$prev_value" "count")
  output_result "COMPARE_THREADS" "上次: $prev_value | 本次: $now_value | 变化: $delta"
}
# Compare TCP connection counts with the previous run.
#
# Without a previous snapshot a first-run notice is reported as
# COMPARE_CONNECTIONS. Otherwise, for each metric that has both a current and
# a previous value, the previous value, current value and change (count type)
# are reported:
#   TCP_ESTABLISHED -> COMPARE_TCP_ESTAB
#   TCP_TIME_WAIT   -> COMPARE_TCP_TIMEWAIT
compare_connections() {
  local previous_file=$(load_last_data)
  if [ -z "$previous_file" ]; then
    output_result "COMPARE_CONNECTIONS" "首次检测,无历史数据"
    return
  fi

  # Each entry is "<metric key>:<result key>".
  local pair
  for pair in "TCP_ESTABLISHED:COMPARE_TCP_ESTAB" "TCP_TIME_WAIT:COMPARE_TCP_TIMEWAIT"; do
    local metric_key=${pair%%:*}
    local result_key=${pair#*:}
    local now_value=$(extract_metric "$metric_key" "$CURRENT_DATA_FILE")
    local prev_value=$(extract_metric "$metric_key" "$previous_file")
    if [ -n "$now_value" ] && [ -n "$prev_value" ]; then
      local delta=$(calculate_change "$now_value" "$prev_value" "count")
      output_result "$result_key" "上次: $prev_value | 本次: $now_value | 变化: $delta"
    fi
  done
}
# Compare MySQL connection and slow-query counts with the previous run.
#
# Without a previous snapshot a first-run notice is reported as
# COMPARE_MYSQL_CONN. Otherwise:
#   COMPARE_MYSQL_CONN  previous/current "MYSQL_CONNECTIONS_CURRENT" and the
#                       change computed by calculate_change (count type)
#   COMPARE_MYSQL_SLOW  previous/current "MYSQL_SLOW_QUERIES" and the absolute
#                       increase/decrease; "N/A" when a value is not a plain
#                       non-negative integer
# Each line is only reported when both values are present.
compare_mysql() {
  local last_file=$(load_last_data)
  if [ -z "$last_file" ]; then
    output_result "COMPARE_MYSQL_CONN" "首次检测,无历史数据"
    return
  fi

  local current_conn=$(extract_metric "MYSQL_CONNECTIONS_CURRENT" "$CURRENT_DATA_FILE")
  local last_conn=$(extract_metric "MYSQL_CONNECTIONS_CURRENT" "$last_file")
  if [ -n "$current_conn" ] && [ -n "$last_conn" ]; then
    local change=$(calculate_change "$current_conn" "$last_conn" "count")
    output_result "COMPARE_MYSQL_CONN" "上次: $last_conn | 本次: $current_conn | 变化: $change"
  fi

  local current_slow=$(extract_metric "MYSQL_SLOW_QUERIES" "$CURRENT_DATA_FILE")
  local last_slow=$(extract_metric "MYSQL_SLOW_QUERIES" "$last_file")
  if [ -n "$current_slow" ] && [ -n "$last_slow" ]; then
    local trend="N/A"
    # Shell arithmetic evaluates arbitrary text as an expression, so only
    # digit-only values are allowed through; 10# keeps leading zeros decimal.
    if [[ "$current_slow" =~ ^[0-9]+$ && "$last_slow" =~ ^[0-9]+$ ]]; then
      local diff=$((10#$current_slow - 10#$last_slow))
      trend="持平"
      if [ "$diff" -gt 0 ]; then
        trend="🔴 增加${diff}"
      elif [ "$diff" -lt 0 ]; then
        trend="🟢 减少$((-diff))"
      fi
    fi
    output_result "COMPARE_MYSQL_SLOW" "上次: $last_slow | 本次: $current_slow | 变化: $trend"
  fi
}
# Compare Redis key and client counts with the previous run.
#
# Without a previous snapshot a first-run notice is reported as
# COMPARE_REDIS_KEYS. Otherwise, for each metric that has both a current and a
# previous value, the previous value, current value and change (count type)
# are reported:
#   REDIS_KEYS    -> COMPARE_REDIS_KEYS
#   REDIS_CLIENTS -> COMPARE_REDIS_CLIENTS
compare_redis() {
  local previous_file=$(load_last_data)
  if [ -z "$previous_file" ]; then
    output_result "COMPARE_REDIS_KEYS" "首次检测,无历史数据"
    return
  fi

  # Each entry is "<metric key>:<result key>".
  local pair
  for pair in "REDIS_KEYS:COMPARE_REDIS_KEYS" "REDIS_CLIENTS:COMPARE_REDIS_CLIENTS"; do
    local metric_key=${pair%%:*}
    local result_key=${pair#*:}
    local now_value=$(extract_metric "$metric_key" "$CURRENT_DATA_FILE")
    local prev_value=$(extract_metric "$metric_key" "$previous_file")
    if [ -n "$now_value" ] && [ -n "$prev_value" ]; then
      local delta=$(calculate_change "$now_value" "$prev_value" "count")
      output_result "$result_key" "上次: $prev_value | 本次: $now_value | 变化: $delta"
    fi
  done
}
# Compare the application error count with the previous run.
#
# Reports COMPARE_APP_ERRORS: a first-run notice when there is no previous
# snapshot, otherwise the previous and current "APP_LOG_ERRORS_24H" values and
# the absolute increase/decrease ("N/A" when a value is not a plain
# non-negative integer). Nothing is reported when either value is missing.
compare_app_errors() {
  local last_file=$(load_last_data)
  if [ -z "$last_file" ]; then
    output_result "COMPARE_APP_ERRORS" "首次检测,无历史数据"
    return
  fi

  local current_errors=$(extract_metric "APP_LOG_ERRORS_24H" "$CURRENT_DATA_FILE")
  local last_errors=$(extract_metric "APP_LOG_ERRORS_24H" "$last_file")
  if [ -z "$current_errors" ] || [ -z "$last_errors" ]; then
    return 0
  fi

  local trend="N/A"
  # Shell arithmetic evaluates arbitrary text as an expression, so only
  # digit-only values are allowed through; 10# keeps leading zeros decimal.
  if [[ "$current_errors" =~ ^[0-9]+$ && "$last_errors" =~ ^[0-9]+$ ]]; then
    local diff=$((10#$current_errors - 10#$last_errors))
    trend="持平"
    if [ "$diff" -gt 0 ]; then
      trend="🔴 新增${diff}个"
    elif [ "$diff" -lt 0 ]; then
      trend="🟢 减少$((-diff))个"
    fi
  fi
  output_result "COMPARE_APP_ERRORS" "上次: $last_errors | 本次: $current_errors | 变化: $trend"
}
# Print the time of the previous check.
#
# The previous snapshot's file name (without ".data") is interpreted as epoch
# seconds and printed as "YYYY-MM-DD HH:MM:SS"; "无历史记录" is printed when
# there is no previous snapshot. An empty line results if `date` cannot
# convert the name.
get_last_check_time() {
  local snapshot=$(load_last_data)
  if [ -z "$snapshot" ]; then
    echo "无历史记录"
    return
  fi

  local epoch=$(basename "$snapshot" .data)
  local formatted=$(date -d @"$epoch" "+%Y-%m-%d %H:%M:%S" 2>/dev/null)
  echo "$formatted"
}
# ==================== Main check flow ====================
# Entry point of the comprehensive diagnosis.
#
# Requires $CURRENT_DATA_FILE; without it COMPREHENSIVE_STATUS="数据缺失" is
# reported and 1 is returned. Otherwise it runs every diagnose_* step, stores
# the current data as a history snapshot, reports LAST_CHECK_TIME, runs every
# compare_* step and finally reports COMPREHENSIVE_STATUS="完成".
main() {
  log_info "开始综合诊断检测..."

  # Nothing can be diagnosed or compared without the current data file.
  if [ ! -f "$CURRENT_DATA_FILE" ]; then
    log_warn "当前检测数据文件不存在,跳过对比分析"
    output_result "COMPREHENSIVE_STATUS" "数据缺失"
    return 1
  fi

  # Make sure the history directory exists
  mkdir -p "$HISTORY_DIR" 2>/dev/null

  local step

  # Core problem diagnosis
  log_info "执行核心问题诊断..."
  for step in \
    diagnose_system_load \
    diagnose_memory_pressure \
    diagnose_swap_usage \
    diagnose_disk_io \
    diagnose_mysql_slow \
    diagnose_zombie \
    diagnose_timewait; do
    "$step"
  done

  # Store the current data as a history snapshot
  save_current_data

  # Comparison against the previous run
  log_info "执行历史对比分析..."
  output_result "LAST_CHECK_TIME" "$(get_last_check_time)"
  for step in \
    compare_cpu \
    compare_memory \
    compare_disk \
    compare_threads \
    compare_connections \
    compare_mysql \
    compare_redis \
    compare_app_errors; do
    "$step"
  done

  output_result "COMPREHENSIVE_STATUS" "完成"
  log_info "综合诊断检测完成"
}
# Run the main function
main
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论