提交 cd877ad5 authored 作者: 陈泽健's avatar 陈泽健

feat(system): 添加系统资源检测功能

- 新增系统资源限制、内核启动参数和内核关键参数检测
- 扩展CPU检测,增加中断统计、软中断、调度器队列和CPU亲和性检测
- 添加内存压力、虚拟内存统计、Slab缓存和大页内存检测
- 增加磁盘延迟、NFS挂载、RAID状态、LVM和磁盘调度算法检测
- 新增不可中断进程、进程文件句柄和网络连接数TOP5检测
- 添加TCP参数、扩展统计、ARP表、路由表和Socket统计检测
- 扩展PowerShell脚本模块映射和国际化
上级 37be9a81
......@@ -342,7 +342,15 @@ function Get-ModuleCategory {
)
switch -Regex ($ModuleName) {
"^(0[1-9]|1[0-2])_" { return "系统基础信息" }
"^01_" { return "系统基础信息" }
"^02_" { return "CPU检测" }
"^03_" { return "内存检测" }
"^04_" { return "磁盘检测" }
"^05_" { return "OOM检测" }
"^06_" { return "进程检测" }
"^07_" { return "网络检测" }
"^11_" { return "计划任务" }
"^12_" { return "端口检测" }
"^(20|21)_" { return "Docker容器" }
"^2[23]_" { return "MySQL数据库" }
"^2[45]_" { return "Redis缓存" }
......@@ -415,6 +423,92 @@ function Get-DisplayName {
"CPU_USAGE" = "CPU使用率"
"MEMORY_USAGE" = "内存使用率"
"DISK_USAGE" = "磁盘使用率"
"ULIMIT_INFO" = "系统资源限制"
"KERNEL_CMDLINE" = "内核启动参数"
"KERNEL_PARAM_FS_FILE_MAX" = "文件描述符最大值"
"KERNEL_PARAM_INOTIFY_MAX_WATCHES" = "inotify最大监听数"
"KERNEL_PARAM_SOMAXCONN" = "监听队列最大长度"
"KERNEL_PARAM_TCP_TW_REUSE" = "TCP TIME_WAIT重用"
"KERNEL_PARAM_TCP_FIN_TIMEOUT" = "TCP FIN超时时间"
"CPU_USER" = "CPU用户态使用率"
"CPU_SYSTEM" = "CPU系统态使用率"
"CPU_PER_CORE" = "各核心CPU使用率"
"CPU_TOP10_PROCESSES" = "CPU占用TOP10进程"
"CPU_CONTEXT_SWITCHES" = "CPU上下文切换次数"
"CPU_INTERRUPTS" = "CPU中断次数"
"IRQ_DETAIL" = "中断详情"
"SOFTIRQS" = "软中断统计"
"SCHEDULER_RUNQUEUE" = "调度器运行队列长度"
"CPU_AFFINITY_SAMPLE" = "CPU亲和性示例"
"MEMORY_USED" = "已用内存"
"MEMORY_FREE" = "空闲内存"
"MEMORY_TOTAL" = "总内存"
"MEMORY_BUFFERS" = "内存缓冲区"
"MEMORY_CACHED" = "内存缓存"
"MEMORY_TOP5_PROCESSES" = "内存占用TOP5进程"
"MEMORY_PRESSURE_AVG10" = "内存压力(avg10)"
"MEMORY_PRESSURE_AVG60" = "内存压力(avg60)"
"VM_PGMajFAULT" = "主缺页异常次数"
"VM_PSWPIN" = "换入页面数"
"VM_PSWPOUT" = "换出页面数"
"SLAB_TOTAL" = "Slab缓存总量"
"SLAB_RECLAIMABLE" = "可回收Slab缓存"
"HUGEPAGES_TOTAL" = "大页总数"
"HUGEPAGES_FREE" = "空闲大页数"
"HUGEPAGES_SIZE" = "大页大小"
"TRANSPARENT_HUGEPAGE" = "透明大页状态"
"FD_ALLOCATED" = "已分配文件描述符"
"FD_MAXIMUM" = "最大文件描述符"
"FD_STATUS" = "文件描述符状态"
"LOAD_CORES" = "CPU核心数"
"LOAD_RATIO" = "负载与核心数比值"
"LOAD_STATUS" = "系统负载状态"
"PROCESS_TOP10_CPU" = "CPU占用TOP10进程"
"PROCESS_TOP10_MEMORY" = "内存占用TOP10进程"
"PROCESS_COUNT" = "进程总数"
"PROCESS_LONGEST_RUNNING" = "运行时间最长进程"
"ZOMBIE_COUNT" = "僵尸进程数"
"ZOMBIE_STATUS" = "僵尸进程状态"
"THREAD_STATUS" = "线程状态"
"UNINTERRUPTIBLE_COUNT" = "不可中断进程数"
"UNINTERRUPTIBLE_PROCESSES" = "不可中断进程详情"
"UNINTERRUPTIBLE_STATUS" = "不可中断进程状态"
"PROCESS_OPEN_FILES_TOP5" = "打开文件数TOP5进程"
"PROCESS_CONNECTIONS_TOP5" = "网络连接数TOP5进程"
"TCP_ESTABLISHED" = "ESTABLISHED连接数"
"TCP_TIME_WAIT" = "TIME_WAIT连接数"
"TCP_TIME_WAIT_STATUS" = "TIME_WAIT连接状态"
"TCP_CLOSE_WAIT" = "CLOSE_WAIT连接数"
"TCP_CLOSE_WAIT_STATUS" = "CLOSE_WAIT连接状态"
"NET_INTERFACE_UP" = "启用网络接口数"
"NET_INTERFACE_TOTAL" = "总网络接口数"
"NET_INTERFACE_STATUS" = "网络接口状态"
"NET_ERRORS" = "网络错误"
"NET_ERRORS_STATUS" = "网络错误状态"
"NET_TRAFFIC" = "网络流量"
"DNS_RESOLUTION" = "DNS解析状态"
"NET_HOSTNAME" = "主机名"
"NET_GATEWAY" = "默认网关"
"NET_GATEWAY_STATUS" = "网关状态"
"TCP_EXTENDED_STATS" = "TCP扩展统计"
"ARP_ENTRIES" = "ARP表项"
"ARP_COUNT" = "ARP条目数"
"ROUTES_COUNT" = "路由条目数"
"SOCKET_STATS" = "Socket统计"
"NET_BANDWIDTH_DETAIL" = "网络带宽详情"
"LISTEN_QUEUE_BACKLOG" = "监听队列积压"
"GATEWAY_PING_STATUS" = "网关连通性"
"CONTAINER_NETWORK_TO_EMQX" = "到EMQX容器网络"
"DISK_IO_STATUS" = "磁盘IO状态"
"DISK_SMART_STATUS" = "磁盘SMART状态"
"DISK_PARTITIONS" = "磁盘分区信息"
"DISK_LATENCY" = "磁盘延迟"
"REMOTE_MOUNTS" = "远程挂载点"
"REMOTE_MOUNTS_COUNT" = "远程挂载数量"
"RAID_DEVICES" = "RAID设备"
"RAID_STATUS" = "RAID状态"
"LVM_VOLUMES" = "LVM卷信息"
"DISK_SCHEDULER" = "磁盘调度器"
# Redis基础
"KEY_COUNT" = "Redis键数量"
......@@ -960,6 +1054,24 @@ function Get-Threshold {
"EMQX_CERT_LEVEL" = "<30天"
"MYSQL_CERT_MIN_DAYS" = "<30天"
"MYSQL_CERT_LEVEL" = "<30天"
# ==================== 系统基础信息新增阈值 ====================
"TCP_TIME_WAIT_STATUS" = "严重"
"TCP_CLOSE_WAIT_STATUS" = "严重"
"NET_INTERFACE_STATUS" = "严重"
"NET_ERRORS_STATUS" = "警告"
"DNS_RESOLUTION" = "警告"
"NET_GATEWAY_STATUS" = "警告"
"GATEWAY_PING_STATUS" = "异常"
"FD_STATUS" = "严重"
"LOAD_STATUS" = "严重"
"ZOMBIE_STATUS" = "严重"
"THREAD_STATUS" = "严重"
"UNINTERRUPTIBLE_STATUS" = "严重"
"DISK_SMART_STATUS" = "严重"
"RAID_STATUS" = "degraded"
"MEMORY_PRESSURE_AVG10" = ">1"
"VM_PGMajFAULT" = ">100"
}
if ($thresholds.ContainsKey($Key)) {
......
......@@ -114,6 +114,75 @@ check_architecture() {
output_result "ARCHITECTURE" "$arch"
}
# Report the current shell's resource limits as a single comma-joined line.
# Outputs: ULIMIT_INFO via output_result; the value is "无法获取" when nothing
#          could be collected.
check_ulimit() {
  local limits=""

  # check_command is defined elsewhere in the file; only collect when it
  # reports success for `ulimit`. At most the first 20 lines are kept.
  check_command ulimit \
    && limits=$(ulimit -a 2>/dev/null | head -20 | tr '\n' ',' | sed 's/,$//')

  output_result "ULIMIT_INFO" "${limits:-无法获取}"
}
# Report the kernel boot command line with spaces replaced by commas,
# capped at 200 characters.
# Arguments: $1 - optional file to read instead of /proc/cmdline
# Outputs:   KERNEL_CMDLINE via output_result; the value is "无法读取" when the
#            file is missing or empty.
check_kernel_cmdline() {
  local cmdline_file="${1:-/proc/cmdline}"
  local max_len=200
  local cmdline=""

  if [ -f "$cmdline_file" ]; then
    cmdline=$(tr ' ' ',' 2>/dev/null < "$cmdline_file") || cmdline=""
    # Add the ellipsis only when something was actually cut off; comparing the
    # length after truncation would also flag a line of exactly max_len chars.
    if [ "${#cmdline}" -gt "$max_len" ]; then
      cmdline="${cmdline:0:max_len}..."
    fi
  fi

  if [ -n "$cmdline" ]; then
    output_result "KERNEL_CMDLINE" "$cmdline"
  else
    output_result "KERNEL_CMDLINE" "无法读取"
  fi
}
# Report a fixed set of kernel tunables read from /proc/sys.
# Outputs: one output_result call per tunable whose file exists and is
#          non-empty; tunables that are missing are skipped silently.
# Returns: always 0. A missing tunable is not an error, and the previous
#          `[ -n "$v" ] && output_result ...` tail returned 1 in that case,
#          which aborts the caller when `set -e` is active.
check_kernel_params() {
  # "KEY:path" pairs, emitted in this order.
  local -a specs=(
    # System-wide open file handle limit
    "FS_FILE_MAX:/proc/sys/fs/file-max"
    # Per-user inotify watch limit
    "INOTIFY_MAX_WATCHES:/proc/sys/fs/inotify/max_user_watches"
    # Upper bound for the listen() backlog
    "NET_SOMAXCONN:/proc/sys/net/core/somaxconn"
    # FIN-WAIT-2 timeout for orphaned connections (not the TIME_WAIT length)
    "TCP_FIN_TIMEOUT:/proc/sys/net/ipv4/tcp_fin_timeout"
    # Idle time before TCP keepalive probes start
    "TCP_KEEPALIVE_TIME:/proc/sys/net/ipv4/tcp_keepalive_time"
  )
  # NOTE(review): the display-name table visible in this change uses
  # KERNEL_PARAM_* keys; confirm whether output_result adds that prefix.

  local spec key path value
  for spec in "${specs[@]}"; do
    key=${spec%%:*}
    path=${spec#*:}
    [ -f "$path" ] || continue
    value=$(cat "$path" 2>/dev/null) || continue
    if [ -n "$value" ]; then
      output_result "$key" "$value"
    fi
  done

  return 0
}
# ==================== 主检测流程 ====================
main() {
log_info "开始系统基础信息检测..."
......@@ -128,6 +197,9 @@ main() {
check_memory_total
check_boot_time
check_architecture
check_ulimit
check_kernel_cmdline
check_kernel_params
log_info "系统基础信息检测完成"
}
......
......@@ -105,27 +105,29 @@ check_cpu_per_core() {
fi
}
# 检测CPU占用TOP5进程
check_cpu_top5() {
local top5
top5=$(ps -eo pid,comm,%cpu,%mem --no-headers 2>/dev/null | sort -k3 -rn | head -5)
# 检测CPU占用TOP15进程
check_cpu_top15() {
local top15
top15=$(ps -eo pid,comm,%cpu --no-headers 2>/dev/null | sort -k3 -rn | head -15)
if [ -n "$top5" ]; then
# 格式化输出
if [ -n "$top15" ]; then
# 格式化输出,只取前10个避免过长
local formatted=""
local count=0
while IFS= read -r line; do
if [ -n "$line" ]; then
if [ -n "$line" ] && [ $count -lt 10 ]; then
if [ -n "$formatted" ]; then
formatted="${formatted}; ${line}"
else
formatted="${line}"
fi
count=$((count + 1))
fi
done <<< "$top5"
done <<< "$top15"
output_result "CPU_TOP5_PROCESSES" "$formatted"
output_result "CPU_TOP10_PROCESSES" "$formatted"
else
output_result "CPU_TOP5_PROCESSES" "获取失败"
output_result "CPU_TOP10_PROCESSES" "获取失败"
fi
}
......@@ -157,6 +159,56 @@ check_cpu_interrupts() {
fi
}
# Report the first 10 lines of /proc/interrupts (blank lines dropped) joined
# into one comma-separated value.
# Outputs: IRQ_DETAIL via output_result; nothing when the file is missing or
#          yields no text.
check_interrupts_detail() {
  [ -f /proc/interrupts ] || return 0

  local irq_lines
  irq_lines=$(head -10 /proc/interrupts 2>/dev/null | grep -v "^$" | tr '\n' ',' | sed 's/,$//')

  [ -z "$irq_lines" ] || output_result "IRQ_DETAIL" "$irq_lines"
}
# Report the whole of /proc/softirqs flattened onto one comma-separated line.
# Outputs: SOFTIRQS via output_result; nothing when the file is missing or
#          yields no text.
check_softirqs() {
  [ -f /proc/softirqs ] || return 0

  local flattened
  flattened=$(cat /proc/softirqs 2>/dev/null | tr '\n' ',' | sed 's/,$//')

  [ -z "$flattened" ] || output_result "SOFTIRQS" "$flattened"
}
# Report the number of currently runnable tasks from /proc/stat.
# The original matched /runnable/, a word that does not occur in /proc/stat,
# so the result was always "未知"; the relevant line is `procs_running N`.
# Arguments: $1 - optional stat file to read instead of /proc/stat
# Outputs:   SCHEDULER_RUNQUEUE via output_result; the value is "未知" when the
#            counter cannot be read.
check_scheduler_runqueue() {
  local stat_file="${1:-/proc/stat}"
  local runqueue=""

  if [ -r "$stat_file" ]; then
    # Exact match on the first field so procs_blocked etc. are never picked up.
    runqueue=$(awk '$1 == "procs_running" { print $2; exit }' "$stat_file" 2>/dev/null) \
      || runqueue=""
  fi

  output_result "SCHEDULER_RUNQUEUE" "${runqueue:-未知}"
}
# Report the allowed-CPU list of PID 1 as a sample of process CPU affinity.
# Outputs: CPU_AFFINITY_SAMPLE via output_result; the value is "未知" when
#          /proc/1/status is missing or has no Cpus_allowed_list line.
check_cpu_affinity() {
  local cpu_list=""

  if [ -f /proc/1/status ]; then
    # Second field of the "Cpus_allowed_list:" line, e.g. "0-7".
    cpu_list=$(awk '/Cpus_allowed_list/ {print $2}' /proc/1/status 2>/dev/null)
  fi

  output_result "CPU_AFFINITY_SAMPLE" "${cpu_list:-未知}"
}
# ==================== 主检测流程 ====================
main() {
log_info "开始CPU资源检测..."
......@@ -164,9 +216,13 @@ main() {
# 执行各项检测
check_cpu_usage
check_cpu_per_core
check_cpu_top5
check_cpu_top15
check_cpu_context_switches
check_cpu_interrupts
check_interrupts_detail
check_softirqs
check_scheduler_runqueue
check_cpu_affinity
log_info "CPU资源检测完成"
}
......
......@@ -167,6 +167,77 @@ check_memory_top5() {
fi
}
# Report memory pressure-stall averages from the first line of the PSI file.
# Parsing uses the shell's own regex matching instead of `grep -P`, which is
# unavailable in some grep builds and made the original silently report 0.
# Arguments: $1 - optional PSI file to read instead of /proc/pressure/memory
# Outputs:   MEMORY_PRESSURE_AVG10 and MEMORY_PRESSURE_AVG60 via output_result
#            ("0" for a field that is absent from the line); or
#            MEMORY_PRESSURE="不支持" when the file is missing or yields no data.
check_memory_pressure() {
  local psi_file="${1:-/proc/pressure/memory}"
  local first_line=""

  if [ -f "$psi_file" ]; then
    first_line=$(head -n 1 "$psi_file" 2>/dev/null) || first_line=""
  fi

  if [ -z "$first_line" ]; then
    output_result "MEMORY_PRESSURE" "不支持"
    return
  fi

  local avg10="0" avg60="0"
  if [[ $first_line =~ avg10=([0-9.]+) ]]; then
    avg10=${BASH_REMATCH[1]}
  fi
  if [[ $first_line =~ avg60=([0-9.]+) ]]; then
    avg60=${BASH_REMATCH[1]}
  fi

  output_result "MEMORY_PRESSURE_AVG10" "$avg10"
  output_result "MEMORY_PRESSURE_AVG60" "$avg60"
}
# Report major page faults and swap-in/swap-out page counters from vmstat.
# Arguments: $1 - optional file to read instead of /proc/vmstat
# Outputs:   VM_PGMajFAULT, VM_PSWPIN, VM_PSWPOUT via output_result; a counter
#            that is absent is reported as "0". Nothing is emitted when the
#            file does not exist.
check_vm_stat() {
  local vmstat_file="${1:-/proc/vmstat}"
  [ -f "$vmstat_file" ] || return 0

  local pgmajfault="" pswpin="" pswpout=""
  # One pass, exact match on the counter name. Values are carried as strings so
  # large counters are never reformatted by awk. The original `awk ... || echo 0`
  # fallback could not fire because awk exits 0 when nothing matches.
  read -r pgmajfault pswpin pswpout < <(
    awk '
      BEGIN { maj = "0"; swpin = "0"; swpout = "0" }
      $1 == "pgmajfault" { maj = $2 }
      $1 == "pswpin"     { swpin = $2 }
      $1 == "pswpout"    { swpout = $2 }
      END { print maj, swpin, swpout }
    ' "$vmstat_file" 2>/dev/null
  ) || true

  output_result "VM_PGMajFAULT" "${pgmajfault:-0}"
  output_result "VM_PSWPIN" "${pswpin:-0}"
  output_result "VM_PSWPOUT" "${pswpout:-0}"
}
# Report total and reclaimable slab memory from /proc/meminfo, converted from
# the file's kB figure to MB with two decimals.
# Outputs: SLAB_TOTAL and SLAB_RECLAIMABLE via output_result, each only when
#          its line is present; nothing when /proc/meminfo is missing.
check_slab_info() {
  [ -f /proc/meminfo ] || return 0

  local total_kb reclaim_kb
  total_kb=$(awk '/^Slab:/ {print $2}' /proc/meminfo 2>/dev/null)
  reclaim_kb=$(awk '/^SReclaimable:/ {print $2}' /proc/meminfo 2>/dev/null)

  if [ -n "$total_kb" ]; then
    output_result "SLAB_TOTAL" "$(awk -v kb="$total_kb" 'BEGIN {printf "%.2f", kb/1024}')MB"
  fi

  if [ -n "$reclaim_kb" ]; then
    output_result "SLAB_RECLAIMABLE" "$(awk -v kb="$reclaim_kb" 'BEGIN {printf "%.2f", kb/1024}')MB"
  fi
}
# Report huge page counters from /proc/meminfo and the transparent huge page
# setting.
# Outputs: HUGEPAGES_TOTAL and HUGEPAGES_FREE (default "0"), HUGEPAGES_SIZE
#          (default "未知"; the bare number from the Hugepagesize line), and
#          TRANSPARENT_HUGEPAGE when its sysfs file exists. Nothing is emitted
#          when /proc/meminfo is missing.
check_hugepages() {
  [ -f /proc/meminfo ] || return 0

  local pages_total pages_free page_size
  pages_total=$(awk '/^HugePages_Total:/ {print $2}' /proc/meminfo 2>/dev/null)
  pages_free=$(awk '/^HugePages_Free:/ {print $2}' /proc/meminfo 2>/dev/null)
  page_size=$(awk '/^Hugepagesize:/ {print $2}' /proc/meminfo 2>/dev/null)

  output_result "HUGEPAGES_TOTAL" "${pages_total:-0}"
  output_result "HUGEPAGES_FREE" "${pages_free:-0}"
  output_result "HUGEPAGES_SIZE" "${page_size:-未知}"

  # The raw file content is passed through unchanged.
  if [ -f /sys/kernel/mm/transparent_hugepage/enabled ]; then
    output_result "TRANSPARENT_HUGEPAGE" "$(cat /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null)"
  fi
}
# ==================== 主检测流程 ====================
main() {
log_info "开始内存资源检测..."
......@@ -177,6 +248,10 @@ main() {
check_memory_details
check_numa
check_memory_top5
check_memory_pressure
check_vm_stat
check_slab_info
check_hugepages
log_info "内存资源检测完成"
}
......
......@@ -207,6 +207,89 @@ check_disk_speed() {
fi
}
# Report selected counters for the first 5 entries of /proc/diskstats.
# Each entry is rendered as fields 3:4:7:8 of its line and entries are joined
# with commas.
# NOTE(review): per the kernel iostats layout these look like device name,
# reads completed, ms spent reading and writes completed - confirm that this is
# the intended "latency" metric.
# Outputs: DISK_LATENCY via output_result; nothing when the file is missing or
#          yields no text.
check_disk_latency() {
  [ -f /proc/diskstats ] || return 0

  local stats
  stats=$(head -5 /proc/diskstats 2>/dev/null | awk '{print $3":"$4":"$7":"$8}' | tr '\n' ',' | sed 's/,$//')

  [ -z "$stats" ] || output_result "DISK_LATENCY" "$stats"
}
# Report mounted network filesystems (nfs*, cifs, smbfs, fuse.sshfs).
# The filesystem type is taken from the word after the last "type" token of
# each `mount` line, so entries such as `nfsd on /proc/fs/nfsd` or a local
# filesystem mounted under a path containing "nfs" are no longer counted.
# Outputs: REMOTE_MOUNTS ("source:mountpoint" entries joined with commas) when
#          at least one exists, and always REMOTE_MOUNTS_COUNT.
check_nfs_mounts() {
  local remote_list
  remote_list=$(mount 2>/dev/null | awk '
    {
      fstype = ""
      # Scan from the right so a "type" inside the source or target is ignored.
      for (i = NF - 1; i >= 2; i--) {
        if ($i == "type") { fstype = $(i + 1); break }
      }
      if (fstype ~ /^(nfs[0-9]*|cifs|smbfs|fuse\.sshfs)$/) print $1 ":" $3
    }') || remote_list=""

  if [ -z "$remote_list" ]; then
    output_result "REMOTE_MOUNTS_COUNT" "0"
    return
  fi

  # Count rows directly rather than re-splitting the comma-joined string.
  local -a rows=()
  mapfile -t rows <<< "$remote_list"

  output_result "REMOTE_MOUNTS" "${remote_list//$'\n'/,}"
  output_result "REMOTE_MOUNTS_COUNT" "${#rows[@]}"
}
# Report Linux software RAID (md) devices and their overall health.
# Health is derived from the per-array member map (e.g. "[UU]", "[U_U]"): any
# map containing "_" marks the result as degraded. The original looked for a
# literal "[UU+]" / "[UU]", so healthy arrays with three or more members were
# reported as degraded and one healthy pair could hide a degraded array.
# Arguments: $1 - optional file to read instead of /proc/mdstat
# Outputs:   RAID_DEVICES (comma-joined names) when arrays exist, and
#            RAID_STATUS: "正常", "degraded", "无RAID设备", "无法读取" or
#            "无/proc/mdstat".
check_raid_status() {
  local mdstat_file="${1:-/proc/mdstat}"

  if [ ! -f "$mdstat_file" ]; then
    output_result "RAID_STATUS" "无/proc/mdstat"
    return
  fi

  local mdstat
  mdstat=$(cat "$mdstat_file" 2>/dev/null) || mdstat=""
  if [ -z "$mdstat" ]; then
    output_result "RAID_STATUS" "无法读取"
    return
  fi

  # Array lines look like "md0 : active raid1 sdb1[1] sda1[0]".
  local raid_devices
  raid_devices=$(printf '%s\n' "$mdstat" \
    | awk '$1 ~ /^md/ && $2 == ":" { print $1 }' \
    | tr '\n' ',' | sed 's/,$//')

  if [ -z "$raid_devices" ]; then
    output_result "RAID_STATUS" "无RAID设备"
    return
  fi
  output_result "RAID_DEVICES" "$raid_devices"

  if printf '%s\n' "$mdstat" | grep -Eq '\[[U_]*_[U_]*\]'; then
    # No leading space: the value has to equal the "degraded" threshold string.
    output_result "RAID_STATUS" "degraded"
  else
    output_result "RAID_STATUS" "正常"
  fi
}
# Report LVM logical volumes as printed by `lvs` (name, size, attributes;
# comma-separated fields, one volume per line).
# Outputs: LVM_VOLUMES via output_result when lvs prints anything;
#          LVM_STATUS="未安装lvs" when the lvs command is not available.
check_lvm_status() {
  if ! command -v lvs &> /dev/null; then
    output_result "LVM_STATUS" "未安装lvs"
    return
  fi

  local volumes
  volumes=$(lvs --noheadings --separator ',' -o lv_name,lv_size,lv_attr 2>/dev/null)

  [ -z "$volumes" ] || output_result "LVM_VOLUMES" "$volumes"
}
# Report the active I/O scheduler of the first block device that exposes one.
# `queue/scheduler` is a regular file; the original tested it with -d, so the
# loop body never ran and nothing was reported.
# Arguments: $1 - optional directory to scan instead of /sys/block
# Outputs:   DISK_SCHEDULER_<device> via output_result. The value is the
#            bracketed entry of the scheduler file (e.g. "mq-deadline"), or the
#            raw file content when no entry is bracketed.
# NOTE(review): the display-name table visible in this change has a plain
# DISK_SCHEDULER key; confirm how the per-device suffix is mapped.
check_disk_scheduler() {
  local sys_block="${1:-/sys/block}"
  local disk dev_name raw scheduler

  for disk in "$sys_block"/*; do
    [ -f "$disk/queue/scheduler" ] || continue
    dev_name=${disk##*/}
    raw=$(cat "$disk/queue/scheduler" 2>/dev/null) || raw=""
    # The active scheduler is the one shown in square brackets.
    scheduler=$(printf '%s\n' "$raw" | sed -n 's/.*\[\([^]]*\)\].*/\1/p')
    output_result "DISK_SCHEDULER_${dev_name}" "${scheduler:-$raw}"
    break
  done
}
# ==================== 主检测流程 ====================
main() {
log_info "开始磁盘资源检测..."
......@@ -217,6 +300,11 @@ main() {
check_disk_io
check_disk_smart
check_disk_partitions
check_disk_latency
check_nfs_mounts
check_raid_status
check_lvm_status
check_disk_scheduler
# check_disk_speed # 可选,比较耗时
log_info "磁盘资源检测完成"
......
......@@ -211,6 +211,67 @@ check_longest_running() {
fi
}
# Report processes in uninterruptible sleep (STAT starting with "D").
# Fixes over the original:
#   - the definition was written `check uninterruptible_processes()` (with a
#     space), which is a syntax error and does not define the name main() calls;
#   - `grep " D"` could not match the STAT column, which is the first field;
#   - `echo "" | wc -l` yields 1, so an idle system was reported as having one
#     D-state process.
# Outputs: UNINTERRUPTIBLE_COUNT (all matches), UNINTERRUPTIBLE_PROCESSES (at
#          most the first 10 `stat pid comm` rows, comma-joined; only when the
#          count is non-zero) and UNINTERRUPTIBLE_STATUS: "严重" above 5,
#          "警告" above 0, otherwise "正常".
check_uninterruptible_processes() {
  local -a d_lines=()
  local line

  while IFS= read -r line; do
    d_lines+=("$line")
  done < <(ps -eo stat,pid,comm --no-headers 2>/dev/null | awk '$1 ~ /^D/')

  local d_count=${#d_lines[@]}
  output_result "UNINTERRUPTIBLE_COUNT" "$d_count"

  if [ "$d_count" -gt 0 ]; then
    local d_processes
    d_processes=$(printf '%s\n' "${d_lines[@]:0:10}" | tr '\n' ',' | sed 's/,$//')
    output_result "UNINTERRUPTIBLE_PROCESSES" "$d_processes"
  fi

  if [ "$d_count" -gt 5 ]; then
    output_result "UNINTERRUPTIBLE_STATUS" "严重"
  elif [ "$d_count" -gt 0 ]; then
    output_result "UNINTERRUPTIBLE_STATUS" "警告"
  else
    output_result "UNINTERRUPTIBLE_STATUS" "正常"
  fi
}
# Report the five most frequent values of lsof's second column (the PID
# column), formatted as `uniq -c` rows ("count value"), highest first.
# Outputs: PROCESS_OPEN_FILES_TOP5 via output_result; the value is "无数据"
#          when lsof prints nothing and "lsof不可用" when lsof is not installed.
check_process_open_files_top5() {
  if ! command -v lsof &> /dev/null; then
    output_result "PROCESS_OPEN_FILES_TOP5" "lsof不可用"
    return
  fi

  local ranking
  ranking=$(lsof 2>/dev/null | awk '{print $2}' | sort | uniq -c | sort -rn | head -5)

  output_result "PROCESS_OPEN_FILES_TOP5" "${ranking:-无数据}"
}
# Report the five processes that own the most TCP sockets according to
# `ss -ntp`, formatted as `uniq -c` rows: "count name[pid]", highest first.
# The original printed field 7, which is empty on data rows (the process info
# is the 6th field), and the raw `users:(("name",pid=N,fd=M))` text includes
# the per-socket fd, so it could never aggregate per process. Every
# "name",pid=N pair on a row is counted, which also covers shared sockets.
# Outputs: PROCESS_CONNECTIONS_TOP5 via output_result; the value is "无数据"
#          when no owner is visible and "ss不可用" when ss is not installed.
# NOTE(review): ss typically shows owners of other users' sockets only with
# sufficient privileges - confirm the script runs as root.
check_process_connections_top5() {
  if ! command -v ss &> /dev/null; then
    output_result "PROCESS_CONNECTIONS_TOP5" "ss不可用"
    return
  fi

  local top5
  top5=$(ss -ntp 2>/dev/null \
    | grep -oE '"[^"]+",pid=[0-9]+' \
    | sed -E 's/^"([^"]+)",pid=([0-9]+)$/\1[\2]/' \
    | sort | uniq -c | sort -rn | head -5) || top5=""

  if [ -n "$top5" ]; then
    output_result "PROCESS_CONNECTIONS_TOP5" "$top5"
  else
    output_result "PROCESS_CONNECTIONS_TOP5" "无数据"
  fi
}
# ==================== 主检测流程 ====================
main() {
log_info "开始进程状态检测..."
......@@ -224,6 +285,9 @@ main() {
check_process_top_cpu
check_process_top_memory
check_longest_running
check_uninterruptible_processes
check_process_open_files_top5
check_process_connections_top5
log_info "进程状态检测完成"
}
......
......@@ -215,6 +215,146 @@ check_network_config() {
fi
}
# Report selected TCP sysctls read from /proc/sys/net/ipv4.
# Outputs: TCP_TW_REUSE, TCP_TW_RECYCLE and TCP_FIN_TIMEOUT_PARAM via
#          output_result, each only when its file exists and is non-empty
#          (tcp_tw_recycle, for one, does not exist on newer kernels).
# Returns: always 0. The previous `[ -n "$v" ] && output_result ...` tail
#          returned 1 whenever the last value was unavailable, which aborts the
#          caller when `set -e` is active.
check_tcp_params() {
  # "KEY:path" pairs, emitted in this order.
  local -a specs=(
    "TCP_TW_REUSE:/proc/sys/net/ipv4/tcp_tw_reuse"
    "TCP_TW_RECYCLE:/proc/sys/net/ipv4/tcp_tw_recycle"
    "TCP_FIN_TIMEOUT_PARAM:/proc/sys/net/ipv4/tcp_fin_timeout"
  )

  local spec key path value
  for spec in "${specs[@]}"; do
    key=${spec%%:*}
    path=${spec#*:}
    [ -f "$path" ] || continue
    value=$(cat "$path" 2>/dev/null) || continue
    if [ -n "$value" ]; then
      output_result "$key" "$value"
    fi
  done

  return 0
}
# Report the TCP counters from the SNMP statistics file as "name=value" pairs.
# The file carries two "Tcp:" rows: column names first, then the values. The
# original kept only the first row (`head -1`), i.e. the names without any
# numbers; this version pairs each name with its value.
# Arguments: $1 - optional file to read instead of /proc/net/snmp
# Outputs:   TCP_EXT_STATS via output_result (comma-joined, capped at 512
#            characters; the cap is higher than the previous 200 because the
#            paired form is longer than a bare header row). Nothing is emitted
#            when the file is missing or has no Tcp rows.
# NOTE(review): the display-name table visible in this change uses the key
# TCP_EXTENDED_STATS; confirm which spelling the report consumer expects.
check_tcp_extended_stats() {
  local snmp_file="${1:-/proc/net/snmp}"
  [ -f "$snmp_file" ] || return 0

  local max_len=512
  local tcp_stats
  tcp_stats=$(awk '
    $1 == "Tcp:" {
      if (!have_names) {
        for (i = 2; i <= NF; i++) name[i] = $i
        name_count = NF
        have_names = 1
        next
      }
      line = ""
      for (i = 2; i <= NF && i <= name_count; i++) {
        line = line (i > 2 ? "," : "") name[i] "=" $i
      }
      print line
      exit
    }' "$snmp_file" 2>/dev/null) || tcp_stats=""

  if [ -n "$tcp_stats" ]; then
    output_result "TCP_EXT_STATS" "${tcp_stats:0:max_len}"
  fi
}
# Report the neighbour (ARP) table, ignoring entries in state FAILED.
# The table is read once so that the listed entries and the count describe the
# same snapshot; the original ran `ip neigh` twice.
# Outputs: ARP_ENTRIES (first 10 entries, comma-joined; only when there is at
#          least one) and ARP_COUNT via output_result. Nothing is emitted when
#          the ip command is not available.
# Returns: 0 when ip is unavailable; the original returned 1 from its trailing
#          `[ -n ... ] &&`, which aborts the caller when `set -e` is active.
check_arp_table() {
  command -v ip &> /dev/null || return 0

  local neighbors
  neighbors=$(ip neigh show 2>/dev/null | grep -v "FAILED") || neighbors=""

  local arp_count=0
  if [ -n "$neighbors" ]; then
    local arp_entries
    arp_entries=$(printf '%s\n' "$neighbors" | head -10 | tr '\n' ',' | sed 's/,$//')
    arp_count=$(printf '%s\n' "$neighbors" | grep -c '')
    output_result "ARP_ENTRIES" "$arp_entries"
  fi

  output_result "ARP_COUNT" "$arp_count"
}
# Report how many lines `ip route show` prints.
# Outputs: ROUTES_COUNT via output_result; nothing when the ip command is not
#          available.
check_routing_table() {
  command -v ip &> /dev/null || return 0

  local total=""
  total=$(ip route show 2>/dev/null | wc -l)
  output_result "ROUTES_COUNT" "$total"
}
# Report the content of /proc/net/sockstat flattened onto one comma-separated
# line.
# Outputs: SOCKET_STATS via output_result; nothing when the file is missing or
#          yields no text.
check_socket_stats() {
  [ -f /proc/net/sockstat ] || return 0

  local summary
  summary=$(cat /proc/net/sockstat 2>/dev/null | tr '\n' ',' | sed 's/,$//')

  [ -z "$summary" ] || output_result "SOCKET_STATS" "$summary"
}
# Report fields 1, 2 and 10 of the first three interface rows of /proc/net/dev
# (the two header lines are skipped), rendered as "f1:f2:f10" and joined with
# commas.
# NOTE(review): field 1 keeps the trailing ":" of the interface name, so the
# entries contain "::"; fields 2 and 10 look like the RX and TX byte counters -
# confirm against the /proc/net/dev layout.
# Outputs: NET_BANDWIDTH_DETAIL via output_result; nothing when the file is
#          missing or has no interface rows.
check_network_bandwidth() {
  [ -f /proc/net/dev ] || return 0

  local per_iface=""
  per_iface=$(tail -n +3 /proc/net/dev | head -3 | awk '{print $1":"$2":"$10}' | tr '\n' ',' | sed 's/,$//')

  [ -z "$per_iface" ] || output_result "NET_BANDWIDTH_DETAIL" "$per_iface"
}
# Report listening TCP sockets whose accept queue is not empty.
# In `ss -lnt` output the columns are State, Recv-Q, Send-Q, Local, Peer; for
# listening sockets Recv-Q is the current accept-queue length and Send-Q the
# configured backlog. The original printed field 6, which is the word "Peer" on
# the header row and empty on data rows, so it always reported "Peer".
# Outputs: LISTEN_QUEUE_BACKLOG via output_result: up to five
#          "local_addr(recvq/sendq)" entries joined with commas, "正常" when no
#          queue holds pending connections, "ss不可用" when ss is not installed.
check_listen_queue() {
  if ! command -v ss &> /dev/null; then
    output_result "LISTEN_QUEUE_BACKLOG" "ss不可用"
    return
  fi

  local queue_full
  queue_full=$(ss -lnt 2>/dev/null \
    | awk 'NR > 1 && $2 ~ /^[0-9]+$/ && $2 + 0 > 0 { print $4 "(" $2 "/" $3 ")" }' \
    | head -5 | tr '\n' ',' | sed 's/,$//') || queue_full=""

  if [ -n "$queue_full" ]; then
    output_result "LISTEN_QUEUE_BACKLOG" "$queue_full"
  else
    output_result "LISTEN_QUEUE_BACKLOG" "正常"
  fi
}
# Ping the default gateway and report whether it answers.
# The gateway is taken from `ip route` when ip is available, otherwise from
# `route -n`.
# Outputs: GATEWAY_PING_STATUS via output_result: "正常" when the ping
#          succeeds, "异常" when it fails, "未测试" when no gateway was found
#          or ping is not available.
check_gateway_connectivity() {
  local gw_addr=""
  local result="未测试"

  # First default-route entry wins.
  if command -v ip &> /dev/null; then
    gw_addr=$(ip route | awk '/default/ {print $3}' | head -1)
  elif command -v route &> /dev/null; then
    gw_addr=$(route -n | awk '/^0.0.0.0/ {print $2}' | head -1)
  fi

  if [ -n "$gw_addr" ] && command -v ping &> /dev/null; then
    # Three echo requests, one-second wait each.
    result="异常"
    if ping -c 3 -W 1 "$gw_addr" &>/dev/null; then
      result="正常"
    fi
  fi

  output_result "GATEWAY_PING_STATUS" "$result"
}
# Ping the `uemqx` Docker container from the host and report reachability.
# Outputs: CONTAINER_NETWORK_TO_EMQX via output_result: "正常" when one ping
#          succeeds, "异常" when it fails. Nothing is emitted when docker is
#          unavailable, the container has no IP address, or ping is missing.
check_container_network() {
  command -v docker &> /dev/null || return 0

  # Only the first line of the inspect output is used as the address.
  local target_ip=""
  target_ip=$(docker inspect --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' uemqx 2>/dev/null | head -1)

  [ -n "$target_ip" ] || return 0
  command -v ping &> /dev/null || return 0

  local verdict="异常"
  if ping -c 1 -W 1 "$target_ip" &>/dev/null; then
    verdict="正常"
  fi
  output_result "CONTAINER_NETWORK_TO_EMQX" "$verdict"
}
# ==================== 主检测流程 ====================
main() {
log_info "开始网络连接检测..."
......@@ -227,6 +367,15 @@ main() {
check_network_traffic
check_dns_resolution
check_network_config
check_tcp_params
check_tcp_extended_stats
check_arp_table
check_routing_table
check_socket_stats
check_network_bandwidth
check_listen_queue
check_gateway_connectivity
check_container_network
log_info "网络连接检测完成"
}
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论