提交 e38a4884,作者:陈泽健

feat(system): 添加系统性能监控新功能

- 新增CPU调度器阻塞进程检测功能
- 添加进程可执行文件路径和孤儿进程统计
- 集成系统错误日志和OOM killer详细检测
- 扩展PowerShell脚本中的监控指标和阈值设置
- 优化长列表项过滤逻辑并保留关键进程信息
上级 cd877ad5
......@@ -750,6 +750,17 @@ function Get-DisplayName {
"MYSQL_CERT_LEVEL" = "MySQL证书等级"
"SYSTEM_UPTIME_DAYS" = "系统运行天数"
"HWCLOCK_STATUS" = "硬件时钟状态"
# 新增系统基础检测项
"SCHEDULER_PROCS_RUNNING" = "调度器可运行进程数"
"SCHEDULER_BLOCKED_STATUS" = "调度器阻塞状态"
"PROCESS_EXE_PATHS_TOP10" = "进程可执行路径TOP10"
"PROCESS_ORPHAN_COUNT" = "孤儿进程数"
"PROCESS_ORPHAN_STATUS" = "孤儿进程状态"
"JOURNAL_ERROR_COUNT" = "系统错误数量"
"JOURNAL_ERROR_LEVEL" = "系统错误等级"
"RECENT_SYSTEM_ERRORS" = "最近系统错误"
"OOM_LOG_DETAILS" = "OOM日志详情"
}
if ($displayNames.ContainsKey($Key)) {
......@@ -1072,6 +1083,14 @@ function Get-Threshold {
"RAID_STATUS" = "degraded"
"MEMORY_PRESSURE_AVG10" = ">1"
"VM_PGMajFAULT" = ">100"
# ==================== 新增系统基础检测阈值 ====================
"SCHEDULER_PROCS_RUNNING" = ">100"
"SCHEDULER_BLOCKED_STATUS" = "严重"
"PROCESS_ORPHAN_COUNT" = ">10"
"PROCESS_ORPHAN_STATUS" = "警告"
"JOURNAL_ERROR_COUNT" = ">50"
"JOURNAL_ERROR_LEVEL" = ">50"
}
if ($thresholds.ContainsKey($Key)) {
......@@ -1440,9 +1459,12 @@ function New-MarkdownReport {
continue
}
# 跳过长列表内容项
# 跳过长列表内容项(但保留重要的进程检测项)
if ($item.Name -match "(TOP5|TOP10|TOP20|_LIST|_DETAIL|_DISTRIBUTION|_STATS|_CONFIG|_INFO|TOPICS|TOP1)$") {
continue
# 排除例外项:进程可执行路径、孤儿进程、最近系统错误
if ($item.Name -notmatch "^(PROCESS_EXE_PATHS|RECENT_SYSTEM_ERRORS|PROCESS_ORPHAN)") {
continue
}
}
# 跳过纯配置类长列表项(但保留重要的状态和统计)
......
......@@ -209,6 +209,30 @@ check_cpu_affinity() {
fi
}
# 检测CPU调度器阻塞进程数
#######################################
# Report how many processes the kernel currently lists as runnable and
# derive a coarse scheduler-pressure status from that number.
# Outputs (via output_result):
#   SCHEDULER_PROCS_RUNNING   - the procs_running value from /proc/stat
#   SCHEDULER_BLOCKED_STATUS  - "严重" (>100), "警告" (>50), "正常",
#                               or "未知" when the value cannot be read
#######################################
check_scheduler_blocked() {
    local running_procs
    local status_label

    # procs_running is a single "name value" line in /proc/stat.
    running_procs=$(awk '/procs_running/ {print $2}' /proc/stat 2>/dev/null)

    # Nothing readable: only the status key is emitted.
    if [ -z "$running_procs" ]; then
        output_result "SCHEDULER_BLOCKED_STATUS" "未知"
        return
    fi

    output_result "SCHEDULER_PROCS_RUNNING" "$running_procs"

    # A large runnable count is treated as scheduler pressure.
    if [ "$running_procs" -gt 100 ]; then
        status_label="严重"
    elif [ "$running_procs" -gt 50 ]; then
        status_label="警告"
    else
        status_label="正常"
    fi
    output_result "SCHEDULER_BLOCKED_STATUS" "$status_label"
}
# ==================== 主检测流程 ====================
main() {
log_info "开始CPU资源检测..."
......@@ -223,6 +247,7 @@ main() {
check_softirqs
check_scheduler_runqueue
check_cpu_affinity
check_scheduler_blocked
log_info "CPU资源检测完成"
}
......
......@@ -212,7 +212,7 @@ check_longest_running() {
}
# 检测D状态不可中断睡眠进程
check uninterruptible_processes() {
check_uninterruptible_processes() {
local d_count=0
local d_processes=""
......@@ -272,6 +272,36 @@ check_process_connections_top5() {
fi
}
# 检测进程可执行文件路径
#######################################
# Report executable paths for the top 10 processes and count processes
# whose parent is PID 1.
# Outputs (via output_result):
#   PROCESS_EXE_PATHS_TOP10 - comma-separated "pid:comm:exe" entries for the
#                             10 processes with the highest CPU usage,
#                             "无数据" if ps returned nothing, or
#                             "ps不可用" if ps is not installed
#   PROCESS_ORPHAN_COUNT    - number of processes with PPID == 1
#   PROCESS_ORPHAN_STATUS   - "严重" (>10), "警告" (>0) or "正常"
#######################################
check_process_executable_paths() {
    if ! command -v ps >/dev/null 2>&1; then
        output_result "PROCESS_EXE_PATHS_TOP10" "ps不可用"
        return
    fi

    # Rank by CPU usage inside ps itself. The previous numeric sort on the
    # path column compared non-numeric keys, so the "top 10" was arbitrary.
    # exe is printed before comm because comm may contain spaces
    # (e.g. "Web Content"); everything after the second field is the name.
    local exe_paths=""
    exe_paths=$(ps -eo pid,exe,comm --sort=-pcpu --no-headers 2>/dev/null \
        | head -10 \
        | awk '{ pid = $1; exe = $2; $1 = ""; $2 = ""; sub(/^ +/, ""); print pid ":" $0 ":" exe }' \
        | tr '\n' ',' \
        | sed 's/,$//')

    if [ -n "$exe_paths" ]; then
        output_result "PROCESS_EXE_PATHS_TOP10" "$exe_paths"
    else
        output_result "PROCESS_EXE_PATHS_TOP10" "无数据"
    fi

    # NOTE(review): PPID == 1 also matches every service started directly by
    # init/systemd, so this is an upper bound on real orphans and will
    # usually exceed the >10 threshold on a normal host — confirm whether a
    # stricter definition is wanted.
    local orphan_count=0
    orphan_count=$(ps -eo pid,ppid,comm --no-headers 2>/dev/null \
        | awk '$2 == 1 { n++ } END { print n + 0 }')
    output_result "PROCESS_ORPHAN_COUNT" "$orphan_count"

    if [ "$orphan_count" -gt 10 ]; then
        output_result "PROCESS_ORPHAN_STATUS" "严重"
    elif [ "$orphan_count" -gt 0 ]; then
        output_result "PROCESS_ORPHAN_STATUS" "警告"
    else
        output_result "PROCESS_ORPHAN_STATUS" "正常"
    fi
}
# ==================== 主检测流程 ====================
main() {
log_info "开始进程状态检测..."
......@@ -288,6 +318,7 @@ main() {
check_uninterruptible_processes
check_process_open_files_top5
check_process_connections_top5
check_process_executable_paths
log_info "进程状态检测完成"
}
......
......@@ -400,6 +400,57 @@ check_network_errors() {
fi
}
# 检测最近系统错误
#######################################
# Scan the kernel ring buffer for error-like messages.
# dmesg is used instead of journalctl because it is faster.
# Outputs (via output_result):
#   JOURNAL_ERROR_COUNT  - number of matching messages (0 if none, or if
#                          dmesg is unavailable/unreadable)
#   RECENT_SYSTEM_ERRORS - the 10 most recent matches, comma-joined, or "无"
#   JOURNAL_ERROR_LEVEL  - "严重" (>50), "警告" (>10) or "正常"
#######################################
check_journal_errors() {
    local error_count=0
    local recent_errors=""
    local dmesg_out=""
    local matched=""

    if check_command dmesg; then
        # -l/--level requires an explicit level list; without one dmesg
        # exits with a usage error and nothing is collected. Fall back to
        # plain dmesg for implementations lacking -T/-l.
        dmesg_out=$(dmesg -T -l emerg,alert,crit,err,warn 2>/dev/null) \
            || dmesg_out=$(dmesg 2>/dev/null) \
            || dmesg_out=""

        # "|| true": no match is a normal outcome, not a failure.
        matched=$(printf '%s\n' "$dmesg_out" \
            | grep -iE "error|fail|timeout|refused|denied" || true)

        if [ -n "$matched" ]; then
            # Count every match before truncating, otherwise the >50
            # threshold below could never be reached.
            error_count=$(printf '%s\n' "$matched" | awk 'END { print NR }')
            # The newest entries are at the end of the buffer.
            recent_errors=$(printf '%s\n' "$matched" | tail -10 | tr '\n' ',' | sed 's/,$//')
        fi
    fi

    output_result "JOURNAL_ERROR_COUNT" "$error_count"

    if [ -n "$recent_errors" ]; then
        output_result "RECENT_SYSTEM_ERRORS" "$recent_errors"
    else
        output_result "RECENT_SYSTEM_ERRORS" "无"
    fi

    if [ "$error_count" -gt 50 ]; then
        output_result "JOURNAL_ERROR_LEVEL" "严重"
    elif [ "$error_count" -gt 10 ]; then
        output_result "JOURNAL_ERROR_LEVEL" "警告"
    else
        output_result "JOURNAL_ERROR_LEVEL" "正常"
    fi
}
# 检测OOM Killer日志详情
#######################################
# Report the last few OOM-killer related lines from the kernel ring buffer.
# Outputs (via output_result):
#   OOM_LOG_DETAILS - up to 5 trailing dmesg lines matching
#                     "out of memory" or "killed process" (case-insensitive),
#                     comma-joined; "无" when there are none or dmesg is
#                     not available
#######################################
check_oom_logs_detail() {
    local oom_summary=""
    local oom_lines=""

    if check_command dmesg; then
        oom_lines=$(dmesg 2>/dev/null | grep -i "out of memory\|killed process")
        # Keep the tail of the matches and flatten them onto one line.
        oom_summary=$(echo "$oom_lines" | tail -5 | tr '\n' ',' | sed 's/,$//')
    fi

    if [ -z "$oom_summary" ]; then
        output_result "OOM_LOG_DETAILS" "无"
        return
    fi
    output_result "OOM_LOG_DETAILS" "$oom_summary"
}
# ==================== 主检测流程 ====================
main() {
# 输出一个测试项确保模块被识别
......@@ -419,6 +470,8 @@ main() {
check_service_crashes 2>/dev/null || true
check_systemd_failures 2>/dev/null || true
check_oom_killer 2>/dev/null || true
check_oom_logs_detail 2>/dev/null || true
check_journal_errors 2>/dev/null || true
check_resource_exhaustion 2>/dev/null || true
check_hardware_errors 2>/dev/null || true
check_log_file_sizes 2>/dev/null || true
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论