提交 37be9a81 authored 作者: 陈泽健's avatar 陈泽健

refactor(security): 重构安全检测模块结构

- 将安全合规检测模块重命名为 43_security_compliance.sh
- 将系统日志检测模块重命名为 44_system_logs.sh
- 将时间同步检测模块重命名为 45_time_sync.sh
- 更新PowerShell脚本中的模块映射关系
- 添加安全合规检测相关的显示名称和阈值配置
- 添加系统日志检测相关的显示名称和阈值配置
- 添加时间同步检测相关的显示名称和阈值配置
- 移除原有的 08_security_check.sh、09_system_logs.sh 和 10_time_sync.sh 文件
上级 18da83b0
...@@ -354,6 +354,9 @@ function Get-ModuleCategory { ...@@ -354,6 +354,9 @@ function Get-ModuleCategory {
"^3[26]_" { return "FastDFS应用" } "^3[26]_" { return "FastDFS应用" }
"^33_" { return "应用日志" } "^33_" { return "应用日志" }
"^40_" { return "综合诊断" } "^40_" { return "综合诊断" }
"^43_" { return "安全合规检测" }
"^44_" { return "系统日志检测" }
"^45_" { return "时间同步检测" }
default { return "其他" } default { return "其他" }
} }
} }
...@@ -551,6 +554,108 @@ function Get-DisplayName { ...@@ -551,6 +554,108 @@ function Get-DisplayName {
"FASTDFS_HTTP_STATUS" = "FastDFS HTTP状态" "FASTDFS_HTTP_STATUS" = "FastDFS HTTP状态"
"FASTDFS_STORAGE_UPTIME_DAYS" = "FastDFS Storage运行天数" "FASTDFS_STORAGE_UPTIME_DAYS" = "FastDFS Storage运行天数"
"FASTDFS_GROUP_NAME" = "FastDFS组名" "FASTDFS_GROUP_NAME" = "FastDFS组名"
# 安全合规检测
"AUTH_FAILURES_24H" = "24小时认证失败次数"
"AUTH_FAILURES_LEVEL" = "认证失败等级"
"RECENT_LOGINS" = "最近登录记录"
"CURRENT_USERS_COUNT" = "当前登录用户数"
"CURRENT_USERS" = "当前登录用户"
"SELINUX_STATUS" = "SELinux状态"
"FIREWALL_STATUS" = "防火墙状态"
"FIREWALL_RULES" = "防火墙规则"
"IPTABLES_STATUS" = "iptables状态"
"IPTABLES_RULES_COUNT" = "iptables规则数量"
"OPEN_PORTS_COUNT" = "开放端口数量"
"ABNORMAL_ACCOUNTS" = "异常账户"
"ABNORMAL_ACCOUNTS_LEVEL" = "异常账户等级"
"SUSPICIOUS_SUID_COUNT" = "可疑SUID文件数量"
"SUSPICIOUS_SUID_FILES" = "可疑SUID文件"
"MODIFIED_CONF_COUNT" = "修改的配置文件数量"
"MODIFIED_CONF_FILES" = "修改的配置文件"
"ABNORMAL_CRON" = "异常cron任务"
"ABNORMAL_CRON_LEVEL" = "异常cron任务等级"
"MAX_LOGIN_FAILURES" = "最大登录失败次数"
"BRUTE_FORCE_IPS" = "暴力破解IP"
"BRUTE_FORCE_LEVEL" = "暴力破解等级"
"EMPTY_PASSWORD_ACCOUNTS" = "空密码账户"
"EMPTY_PASSWORD_LEVEL" = "空密码账户等级"
"SSH_PERMIT_ROOT" = "SSH允许root登录"
"SSH_PASSWORD_AUTH" = "SSH密码认证"
"SSH_PORT" = "SSH端口"
"SSH_DEFAULT_PORT" = "SSH使用默认端口"
"SSH_DEFAULT_PORT_LEVEL" = "SSH默认端口等级"
"SSH_ROOT_LOGIN_LEVEL" = "SSH root登录等级"
"SSH_PASSWORD_AUTH_LEVEL" = "SSH密码认证等级"
"OPEN_TCP_PORTS" = "开放TCP端口"
"OPEN_UDP_PORTS" = "开放UDP端口"
"HIGH_RISK_PORTS" = "高风险端口"
"HIGH_RISK_PORTS_LEVEL" = "高风险端口等级"
# 系统日志检测
"KERNEL_ERRORS_24H" = "24小时内核错误数"
"KERNEL_ERRORS_LEVEL" = "内核错误等级"
"DISK_ERRORS_24H" = "24小时磁盘错误数"
"DISK_ERRORS_DMESG" = "dmesg磁盘错误数"
"DISK_ERRORS_LEVEL" = "磁盘错误等级"
"DMESG_ERRORS_COUNT" = "dmesg错误数量"
"DMESG_ERROR_TYPES" = "dmesg错误类型"
"MESSAGES_LOG_STATUS" = "messages日志状态"
"MESSAGES_ERRORS_COUNT" = "messages错误数量"
"MESSAGES_WARNS_COUNT" = "messages警告数量"
"MESSAGES_LOG_SIZE" = "messages日志大小"
"KERNEL_PANIC_COUNT" = "内核panic数量"
"KERNEL_OOPS_COUNT" = "内核oops数量"
"KERNEL_CRASH_FILES" = "内核崩溃文件数量"
"KERNEL_STABILITY_LEVEL" = "内核稳定性等级"
"SERVICE_CRASH_COUNT" = "服务崩溃数量"
"CRASHED_SERVICES" = "崩溃的服务"
"SERVICE_STABILITY_LEVEL" = "服务稳定性等级"
"SYSTEMD_STATUS" = "systemd状态"
"SYSTEMD_FAILED_COUNT" = "systemd失败服务数量"
"SYSTEMD_FAILED_SERVICES" = "systemd失败服务"
"SYSTEMD_FAILED_LEVEL" = "systemd失败等级"
"OOM_KILLER_COUNT" = "OOM Killer数量"
"CORE_DUMP_FILES" = "Core dump文件数量"
"OOM_VICTIMS" = "OOM受害者"
"OOM_LEVEL" = "OOM等级"
"RESOURCE_EXHAUSTION_EVENTS" = "资源耗尽事件"
"RESOURCE_EXHAUSTION_LEVEL" = "资源耗尽等级"
"HARDWARE_ERRORS" = "硬件错误"
"HARDWARE_ERRORS_LEVEL" = "硬件错误等级"
"LARGE_LOG_FILES" = "大日志文件"
"LARGE_LOG_FILES_LEVEL" = "大日志文件等级"
"NETWORK_ERRORS" = "网络错误"
"NETWORK_ERRORS_LEVEL" = "网络错误等级"
# 时间同步检测
"NTP_SERVICE_STATUS" = "NTP服务状态"
"SYSTEM_CLOCK_SYNC" = "系统时钟同步"
"SYSTEM_CLOCK_SYNC_LEVEL" = "系统时钟同步等级"
"NTP_SERVICE_NAME" = "NTP服务名称"
"NTP_DAEMON" = "NTP守护进程"
"NTP_SOURCES" = "NTP同步源"
"NTP_SOURCES_COUNT" = "NTP同步源数量"
"NTP_CURRENT_SOURCE" = "NTP当前同步源"
"NTP_CONFIG_SOURCES" = "NTP配置同步源"
"NTP_OFFSET_MS" = "NTP时钟偏差(毫秒)"
"NTP_OFFSET_SEC" = "NTP时钟偏差(秒)"
"NTP_OFFSET_LEVEL" = "NTP时钟偏差等级"
"SYSTEM_DATETIME" = "系统时间"
"SYSTEM_TIMESTAMP" = "系统时间戳"
"SYSTEM_TIMEZONE" = "系统时区"
"HTTPS_CERT_INFO" = "HTTPS证书信息"
"HTTPS_CERT_MIN_DAYS" = "HTTPS证书最小剩余天数"
"HTTPS_CERT_LEVEL" = "HTTPS证书等级"
"HTTPS_CERT_STATUS" = "HTTPS证书状态"
"EMQX_CERT_INFO" = "EMQX证书信息"
"EMQX_CERT_MIN_DAYS" = "EMQX证书最小剩余天数"
"EMQX_CERT_LEVEL" = "EMQX证书等级"
"EMQX_CERT_STATUS" = "EMQX证书状态"
"MYSQL_CERT_MIN_DAYS" = "MySQL证书最小剩余天数"
"MYSQL_CERT_LEVEL" = "MySQL证书等级"
"SYSTEM_UPTIME_DAYS" = "系统运行天数"
"HWCLOCK_STATUS" = "硬件时钟状态"
} }
if ($displayNames.ContainsKey($Key)) { if ($displayNames.ContainsKey($Key)) {
...@@ -805,6 +910,56 @@ function Get-Threshold { ...@@ -805,6 +910,56 @@ function Get-Threshold {
"FASTDFS_RECENT_ERRORS" = ">10" "FASTDFS_RECENT_ERRORS" = ">10"
"FASTDFS_FILE_COUNT" = ">100000" "FASTDFS_FILE_COUNT" = ">100000"
"FASTDFS_HTTP_STATUS" = "0" "FASTDFS_HTTP_STATUS" = "0"
# ==================== 安全合规检测阈值 ====================
"AUTH_FAILURES_LEVEL" = ">100"
"ABNORMAL_ACCOUNTS" = ">0"
"ABNORMAL_ACCOUNTS_LEVEL" = ">0"
"MODIFIED_CONF_COUNT" = ">10"
"ABNORMAL_CRON" = "未发现异常"
"ABNORMAL_CRON_LEVEL" = "未发现异常"
"MAX_LOGIN_FAILURES" = ">20"
"BRUTE_FORCE_LEVEL" = ">20"
"EMPTY_PASSWORD_ACCOUNTS" = ">0"
"EMPTY_PASSWORD_LEVEL" = ">0"
"SSH_DEFAULT_PORT" = "是"
"SSH_DEFAULT_PORT_LEVEL" = "是"
"SSH_ROOT_LOGIN_LEVEL" = "是"
"SSH_PASSWORD_AUTH_LEVEL" = "是"
"HIGH_RISK_PORTS" = ">0"
"HIGH_RISK_PORTS_LEVEL" = ">0"
# ==================== 系统日志检测阈值 ====================
"KERNEL_ERRORS_LEVEL" = ">10"
"DISK_ERRORS_LEVEL" = ">5"
"KERNEL_PANIC_COUNT" = ">0"
"KERNEL_OOPS_COUNT" = ">5"
"KERNEL_STABILITY_LEVEL" = ">0"
"SERVICE_CRASH_COUNT" = ">5"
"SERVICE_STABILITY_LEVEL" = ">5"
"SYSTEMD_FAILED_COUNT" = ">0"
"SYSTEMD_FAILED_LEVEL" = ">0"
"OOM_KILLER_COUNT" = ">1"
"CORE_DUMP_FILES" = ">0"
"OOM_LEVEL" = ">0"
"RESOURCE_EXHAUSTION_EVENTS" = ">0"
"RESOURCE_EXHAUSTION_LEVEL" = ">0"
"HARDWARE_ERRORS" = ">0"
"HARDWARE_ERRORS_LEVEL" = ">0"
"LARGE_LOG_FILES_LEVEL" = ">0"
"NETWORK_ERRORS" = ">0"
"NETWORK_ERRORS_LEVEL" = ">0"
# ==================== 时间同步检测阈值 ====================
"SYSTEM_CLOCK_SYNC" = "未同步"
"SYSTEM_CLOCK_SYNC_LEVEL" = "未同步"
"NTP_OFFSET_LEVEL" = ">1秒"
"HTTPS_CERT_MIN_DAYS" = "<30天"
"HTTPS_CERT_LEVEL" = "<30天"
"EMQX_CERT_MIN_DAYS" = "<30天"
"EMQX_CERT_LEVEL" = "<30天"
"MYSQL_CERT_MIN_DAYS" = "<30天"
"MYSQL_CERT_LEVEL" = "<30天"
} }
if ($thresholds.ContainsKey($Key)) { if ($thresholds.ContainsKey($Key)) {
...@@ -910,8 +1065,8 @@ function Invoke-AllChecks { ...@@ -910,8 +1065,8 @@ function Invoke-AllChecks {
$systemModules = @( $systemModules = @(
"01_system_basic.sh", "02_cpu_check.sh", "03_memory_check.sh", "01_system_basic.sh", "02_cpu_check.sh", "03_memory_check.sh",
"04_disk_check.sh", "05_oom_check.sh", "06_process_check.sh", "04_disk_check.sh", "05_oom_check.sh", "06_process_check.sh",
"07_network_check.sh", "08_security_check.sh", "09_system_logs.sh", "07_network_check.sh", "43_security_compliance.sh", "44_system_logs.sh",
"10_time_sync.sh", "11_scheduled_tasks.sh", "12_port_check.sh" "45_time_sync.sh", "11_scheduled_tasks.sh", "12_port_check.sh"
) )
# 综合诊断模块(在所有模块之后执行) # 综合诊断模块(在所有模块之后执行)
......
#!/bin/bash
################################################################################
# 安全合规检测模块
# 功能: 检测SELinux、防火墙、开放端口、SSH配置、异常账户等安全相关项
# 作者: Claude Code
# 日期: 2026-05-09
################################################################################
# Resolve the directory containing this script, then load shared dependencies.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LIB_DIR="/tmp/check_modules"

# The configuration file is mandatory: report and abort when it is missing.
if [ ! -f "$LIB_DIR/config.sh" ]; then
  echo "ERROR: 配置文件不存在: $LIB_DIR/config.sh"
  exit 1
fi
source "$LIB_DIR/config.sh"

# The common function library is mandatory as well.
if [ ! -f "$LIB_DIR/common.sh" ]; then
  echo "ERROR: 通用函数库不存在: $LIB_DIR/common.sh"
  exit 1
fi
source "$LIB_DIR/common.sh"
# ==================== 检测函数 ====================
# Report the SELinux mode returned by getenforce together with a level.
# Outputs: SELINUX_STATUS and SELINUX_STATUS_LEVEL via output_result.
check_selinux() {
  local mode=""
  local level

  # No getenforce binary: report "not installed" and stop here.
  if ! command -v getenforce &> /dev/null; then
    output_result "SELINUX_STATUS" "未安装"
    output_result "SELINUX_STATUS_LEVEL" "正常"
    return
  fi

  mode=$(getenforce 2>/dev/null)
  output_result "SELINUX_STATUS" "$mode"

  # Enforcing is the only mode reported as normal; anything unexpected
  # (including empty output) is reported as unknown.
  case "$mode" in
    Enforcing) level="正常" ;;
    Permissive | Disabled) level="警告" ;;
    *) level="未知" ;;
  esac
  output_result "SELINUX_STATUS_LEVEL" "$level"
}
# Detect whether a host firewall is active.
# Probes, in order: firewalld (systemctl), ufw, then raw iptables rules.
# Outputs: FIREWALL_STATUS and FIREWALL_LEVEL via output_result.
check_firewall() {
  local fw_status="未运行"
  local fw_level="警告"
  local rule_count=0

  # firewalld
  if command -v systemctl &> /dev/null \
    && systemctl is-active firewalld &> /dev/null; then
    fw_status="运行中"
    fw_level="正常"
  fi

  # ufw (Ubuntu)
  if [ "$fw_status" = "未运行" ] && command -v ufw &> /dev/null; then
    if ufw status 2>/dev/null | grep -q "Status: active"; then
      fw_status="运行中"
      fw_level="正常"
    fi
  fi

  # iptables: count real rules ("-A ..." lines). Counting "Chain" headers is
  # not a usable signal because the built-in chains exist even when the
  # ruleset is completely empty.
  if [ "$fw_status" = "未运行" ] && command -v iptables &> /dev/null; then
    rule_count=$(iptables -S 2>/dev/null | grep -c '^-A' || true)
    if [[ ! "$rule_count" =~ ^[0-9]+$ ]]; then
      rule_count=0
    fi
    if [ "$rule_count" -gt 0 ]; then
      fw_status="iptables运行"
      fw_level="正常"
    fi
  fi

  output_result "FIREWALL_STATUS" "$fw_status"
  output_result "FIREWALL_LEVEL" "$fw_level"
}
# Count currently logged-in sessions and warn when there are more than 10.
# Outputs: LOGGED_USERS and LOGGED_USERS_LEVEL via output_result.
check_logged_users() {
  local user_count=0

  # Look the tools up on PATH (consistent with the other checks) instead of
  # assuming they live in /usr/bin.
  if command -v who &> /dev/null; then
    user_count=$(who 2>/dev/null | wc -l)
  elif command -v w &> /dev/null; then
    # -h suppresses both header lines, so only session rows are counted.
    user_count=$(w -h 2>/dev/null | wc -l)
  fi

  # Some wc implementations pad the number; strip whitespace and validate.
  user_count=${user_count//[[:space:]]/}
  if [[ ! "$user_count" =~ ^[0-9]+$ ]]; then
    user_count=0
  fi

  output_result "LOGGED_USERS" "$user_count"
  if [ "$user_count" -gt 10 ]; then
    output_result "LOGGED_USERS_LEVEL" "警告"
  else
    output_result "LOGGED_USERS_LEVEL" "正常"
  fi
}
# Count listening TCP sockets and warn when there are more than 50.
# Outputs: OPEN_PORTS and OPEN_PORTS_LEVEL via output_result.
check_open_ports() {
  local open_ports=0

  # grep -c already prints "0" when nothing matches (while exiting 1), so the
  # fallback must not print a second zero; "|| true" only resets the status.
  if command -v ss &> /dev/null; then
    open_ports=$(ss -tlnp 2>/dev/null | grep -c LISTEN || true)
  elif command -v netstat &> /dev/null; then
    open_ports=$(netstat -tln 2>/dev/null | grep -c LISTEN || true)
  fi

  # Keep the integer comparison below from ever seeing a non-number.
  if [[ ! "$open_ports" =~ ^[0-9]+$ ]]; then
    open_ports=0
  fi

  output_result "OPEN_PORTS" "$open_ports"
  if [ "$open_ports" -gt 50 ]; then
    output_result "OPEN_PORTS_LEVEL" "警告"
  else
    output_result "OPEN_PORTS_LEVEL" "正常"
  fi
}
# Count failed password authentications and grade the result.
# Arguments (both optional, defaults keep the previous behaviour):
#   $1 - Debian-style auth log   (default /var/log/auth.log)
#   $2 - RHEL-style secure log   (default /var/log/secure)
# Outputs: AUTH_FAIL_COUNT and AUTH_FAIL_STATUS via output_result, plus an
#          "ERROR:" line on stdout when the status is not normal.
check_auth_failures() {
  local auth_log="${1:-/var/log/auth.log}"
  local secure_log="${2:-/var/log/secure}"
  local fail_count=0
  local status
  local today

  # Primary source: the journal for the last 24 hours. No "-t" filter is
  # used: "-t" matches the syslog identifier (e.g. "sshd"), not a facility,
  # so filtering on "authpriv" would select nothing.
  if command -v journalctl &> /dev/null; then
    fail_count=$(journalctl --since '24 hours ago' --no-pager 2>/dev/null \
      | grep -c "Failed password" || true)
  fi
  if [[ ! "$fail_count" =~ ^[0-9]+$ ]]; then
    fail_count=0
  fi

  # Fallback: plain-text log files.
  if [ "$fail_count" -eq 0 ]; then
    if [ -f "$auth_log" ]; then
      # NOTE(review): this counts the whole file, not only the last 24h.
      fail_count=$(grep -c "Failed password" "$auth_log" 2>/dev/null || true)
    elif [ -f "$secure_log" ]; then
      # Traditional syslog timestamps pad the day with a space ("Jan  5"),
      # which is %e; the format must be quoted so date sees one argument.
      today=$(LC_ALL=C date '+%b %e')
      fail_count=$(grep "Failed password" "$secure_log" 2>/dev/null \
        | grep -F -c -- "$today" || true)
    fi
    if [[ ! "$fail_count" =~ ^[0-9]+$ ]]; then
      fail_count=0
    fi
  fi

  output_result "AUTH_FAIL_COUNT" "$fail_count"
  if [ "$fail_count" -ge 1000 ]; then
    status="严重"
  elif [ "$fail_count" -ge 100 ]; then
    status="警告"
  else
    status="正常"
  fi
  output_result "AUTH_FAIL_STATUS" "$status"
  if [ "$status" != "正常" ]; then
    echo "ERROR:认证失败次数过多(24h): ${fail_count}"
  fi
}
# Look for accounts other than root that carry UID 0 in /etc/passwd.
# Outputs: ANOMALOUS_ACCOUNTS (when /etc/passwd exists) and
#          ANOMALOUS_ACCOUNTS_STATUS via output_result; an "ERROR:" line on
#          stdout when such accounts are found.
check_anomalous_accounts() {
  local extra_roots=""
  local level="正常"

  if [ -f /etc/passwd ]; then
    # Names of every UID-0 entry except the one called exactly "root".
    extra_roots=$(awk -F: '$3==0 && $1!="root" {print $1}' /etc/passwd 2>/dev/null)
    if [ -z "$extra_roots" ]; then
      output_result "ANOMALOUS_ACCOUNTS" "无"
    else
      level="严重"
      output_result "ANOMALOUS_ACCOUNTS" "$extra_roots"
      echo "ERROR:发现异常UID=0账户: ${extra_roots}"
    fi
  fi

  output_result "ANOMALOUS_ACCOUNTS_STATUS" "$level"
}
# Count SUID files that live outside the usual system directories
# (/usr, /bin, /sbin, /lib, /lib64).
# Outputs: SUSPICIOUS_SUID_COUNT (when find is available) and
#          SUSPICIOUS_SUID_STATUS via output_result; an "ERROR:" line on
#          stdout when at least one such file is found.
check_suspicious_suid() {
  local suid_count=0
  local status="正常"

  if command -v find &> /dev/null; then
    # Count every match. Truncating the list before counting would cap the
    # reported number and hide how many files were really found.
    suid_count=$(find / -perm -4000 -type f 2>/dev/null \
      | grep -c -v -E '^/(usr|bin|sbin|lib|lib64)/' || true)
    if [[ ! "$suid_count" =~ ^[0-9]+$ ]]; then
      suid_count=0
    fi

    if [ "$suid_count" -gt 0 ]; then
      status="警告"
      output_result "SUSPICIOUS_SUID_COUNT" "$suid_count"
      echo "ERROR:发现可疑SUID文件: ${suid_count}个"
    else
      output_result "SUSPICIOUS_SUID_COUNT" "0"
    fi
  fi

  output_result "SUSPICIOUS_SUID_STATUS" "$status"
}
# Print the first value configured for an sshd_config keyword.
# Arguments: $1 - config file, $2 - keyword.
# Keywords are compared case-insensitively and any run of blanks/tabs is
# accepted as the separator; only the first occurrence is reported.
# NOTE(review): the "Keyword=value" form and Match blocks are not handled.
_sshd_first_value() {
  awk -v key="$2" 'tolower($1) == tolower(key) { print $2; exit }' "$1" 2>/dev/null
}

# Audit a few security-relevant sshd settings.
# Arguments: $1 - path to sshd_config (optional, default /etc/ssh/sshd_config)
# Outputs: SSH_CONFIG_STATUS and SSH_CONFIG_LEVEL via output_result, plus
#          SSH_CONFIG_ISSUES when at least one risky setting is found.
check_ssh_config() {
  local config_file="${1:-/etc/ssh/sshd_config}"
  local ssh_status="安全"
  local ssh_level="正常"
  local issues=""
  local max_tries

  if [ ! -f "$config_file" ]; then
    output_result "SSH_CONFIG_STATUS" "配置文件不存在"
    output_result "SSH_CONFIG_LEVEL" "未知"
    return
  fi

  # Only an explicit "yes" is flagged, as before.
  if [ "$(_sshd_first_value "$config_file" PermitRootLogin)" = "yes" ]; then
    issues="${issues}允许root登录;"
  fi
  if [ "$(_sshd_first_value "$config_file" PasswordAuthentication)" = "yes" ]; then
    issues="${issues}允许密码认证;"
  fi

  # Validate before the integer comparison so a malformed or repeated
  # directive cannot break the test.
  max_tries=$(_sshd_first_value "$config_file" MaxAuthTries)
  if [[ "$max_tries" =~ ^[0-9]+$ ]] && [ "$max_tries" -gt 3 ]; then
    issues="${issues}最大认证次数过高(${max_tries});"
  fi

  if [ -n "$issues" ]; then
    ssh_status="存在风险"
    ssh_level="警告"
  fi

  output_result "SSH_CONFIG_STATUS" "$ssh_status"
  output_result "SSH_CONFIG_LEVEL" "$ssh_level"
  if [ -n "$issues" ]; then
    output_result "SSH_CONFIG_ISSUES" "$issues"
  fi
}
# Report whether recent login records can be read with last(1).
# Outputs: RECENT_LOGINS via output_result.
check_recent_logins() {
  local recent_logins=""

  if ! command -v last &> /dev/null; then
    output_result "RECENT_LOGINS" "命令不可用"
    return
  fi

  # -R is the short form of --nohostname. A single-dash "-nohostname" is
  # parsed as "-n ohostname", which makes last fail and yields no records.
  recent_logins=$(last -n 5 -R 2>/dev/null | head -5)
  if [ -n "$recent_logins" ]; then
    output_result "RECENT_LOGINS" "已获取"
  else
    output_result "RECENT_LOGINS" "无记录"
  fi
}
# Report the password ageing settings found in /etc/login.defs.
# Outputs: PASS_MAX_DAYS / PASS_MIN_DAYS (only when a value is present) and
#          PASSWORD_POLICY via output_result.
check_password_policy() {
  local policy_state="未知"
  local max_age=""
  local min_age=""

  if [ -f /etc/login.defs ]; then
    # Second field of every line that starts with the keyword.
    max_age=$(awk '/^PASS_MAX_DAYS/ {print $2}' /etc/login.defs 2>/dev/null)
    min_age=$(awk '/^PASS_MIN_DAYS/ {print $2}' /etc/login.defs 2>/dev/null)

    if [ -n "$max_age" ]; then
      output_result "PASS_MAX_DAYS" "$max_age"
    fi
    if [ -n "$min_age" ]; then
      output_result "PASS_MIN_DAYS" "$min_age"
    fi
    policy_state="已配置"
  fi

  output_result "PASSWORD_POLICY" "$policy_state"
}
# ==================== Main check flow ====================
# Run every security-compliance check in a fixed order, bracketed by
# log_info start/finish messages.
main() {
  local step

  log_info "开始安全合规检测..."

  # Each entry is the name of a check function; they run in list order.
  for step in \
    check_selinux \
    check_firewall \
    check_logged_users \
    check_open_ports \
    check_auth_failures \
    check_anomalous_accounts \
    check_suspicious_suid \
    check_ssh_config \
    check_recent_logins \
    check_password_policy; do
    "$step"
  done

  log_info "安全合规检测完成"
}
# 执行主函数
main
#!/bin/bash
################################################################################
# 系统日志检测模块
# 功能: 检测系统日志中的错误、警告、认证失败、服务失败等
# 作者: Claude Code
# 日期: 2026-05-09
################################################################################
# Resolve the directory containing this script, then load shared dependencies.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LIB_DIR="/tmp/check_modules"

# The configuration file is mandatory: report and abort when it is missing.
if [ ! -f "$LIB_DIR/config.sh" ]; then
  echo "ERROR: 配置文件不存在: $LIB_DIR/config.sh"
  exit 1
fi
source "$LIB_DIR/config.sh"

# The common function library is mandatory as well.
if [ ! -f "$LIB_DIR/common.sh" ]; then
  echo "ERROR: 通用函数库不存在: $LIB_DIR/common.sh"
  exit 1
fi
source "$LIB_DIR/common.sh"
# ==================== 检测函数 ====================
# Count kernel journal lines from the last 24 hours that mention "error"
# (case-insensitive) and warn above 50.
# Outputs: KERNEL_ERRORS_24H and KERNEL_ERRORS_STATUS via output_result.
check_kernel_errors() {
  local error_count=0

  if command -v journalctl &> /dev/null; then
    # grep -c prints "0" itself on no match; "|| true" only resets the exit
    # status so the captured value stays a single number.
    error_count=$(journalctl -k --since '24 hours ago' --no-pager 2>/dev/null \
      | grep -ci "error" || true)
  fi
  if [[ ! "$error_count" =~ ^[0-9]+$ ]]; then
    error_count=0
  fi

  output_result "KERNEL_ERRORS_24H" "$error_count"
  if [ "$error_count" -gt 50 ]; then
    output_result "KERNEL_ERRORS_STATUS" "警告"
  else
    output_result "KERNEL_ERRORS_STATUS" "正常"
  fi
}
# Count journal lines from the last 24 hours that contain
# "authentication failure" (case-insensitive) and grade the result.
# Outputs: AUTH_FAILURES_24H and AUTH_FAILURES_STATUS via output_result,
#          plus an "ERROR:" line on stdout when the status is not normal.
check_auth_failures() {
  local fail_count=0
  local status

  if command -v journalctl &> /dev/null; then
    # "|| true" instead of "|| echo 0": grep -c already printed the zero.
    fail_count=$(journalctl --since '24 hours ago' --no-pager 2>/dev/null \
      | grep -ci "authentication failure" || true)
  fi
  if [[ ! "$fail_count" =~ ^[0-9]+$ ]]; then
    fail_count=0
  fi

  if [ "$fail_count" -ge 1000 ]; then
    status="严重"
  elif [ "$fail_count" -ge 100 ]; then
    status="警告"
  else
    status="正常"
  fi

  output_result "AUTH_FAILURES_24H" "$fail_count"
  output_result "AUTH_FAILURES_STATUS" "$status"
  if [ "$status" != "正常" ]; then
    echo "ERROR:认证失败次数过多(24h): ${fail_count}"
  fi
}
# Count journal lines from the last 24 hours that mention "I/O error" or
# "disk error" (case-insensitive); any hit is a warning.
# Outputs: DISK_ERRORS_24H and DISK_ERRORS_STATUS via output_result, plus an
#          "ERROR:" line on stdout when errors are found.
check_disk_errors() {
  local error_count=0

  if command -v journalctl &> /dev/null; then
    # "|| true" instead of "|| echo 0": grep -c already printed the zero.
    error_count=$(journalctl --since '24 hours ago' --no-pager 2>/dev/null \
      | grep -ciE "I/O error|disk error" || true)
  fi
  if [[ ! "$error_count" =~ ^[0-9]+$ ]]; then
    error_count=0
  fi

  output_result "DISK_ERRORS_24H" "$error_count"
  if [ "$error_count" -gt 0 ]; then
    output_result "DISK_ERRORS_STATUS" "警告"
    echo "ERROR:检测到磁盘错误: ${error_count} 个"
  else
    output_result "DISK_ERRORS_STATUS" "正常"
  fi
}
# Count kernel journal lines from the last 7 days that look OOM-related
# ("oom", "out of memory", "killed process"; case-insensitive).
# Outputs: OOM_LOGS_7D and OOM_LOGS_STATUS via output_result, plus an
#          "ERROR:" line on stdout when any such line exists.
check_oom_logs() {
  local oom_count=0

  if command -v journalctl &> /dev/null; then
    # "|| true" instead of "|| echo 0": grep -c already printed the zero.
    oom_count=$(journalctl -k --since '7 days ago' --no-pager 2>/dev/null \
      | grep -ciE "oom|out of memory|killed process" || true)
  fi
  if [[ ! "$oom_count" =~ ^[0-9]+$ ]]; then
    oom_count=0
  fi

  output_result "OOM_LOGS_7D" "$oom_count"
  if [ "$oom_count" -gt 0 ]; then
    output_result "OOM_LOGS_STATUS" "严重"
    echo "ERROR:7天内有OOM记录"
  else
    output_result "OOM_LOGS_STATUS" "正常"
  fi
}
# Count systemd service units currently in the failed state.
# Outputs: FAILED_SERVICES and FAILED_SERVICES_STATUS via output_result,
#          plus an "ERROR:" line on stdout when at least one unit failed.
check_failed_services() {
  local failed_count=0

  if command -v systemctl &> /dev/null; then
    # Count non-empty lines straight from the pipe. Echoing an empty list
    # first would produce one blank line and be counted as one failure.
    failed_count=$(systemctl list-units --type=service --state=failed \
      --no-pager --no-legend 2>/dev/null | grep -c . || true)
  fi
  if [[ ! "$failed_count" =~ ^[0-9]+$ ]]; then
    failed_count=0
  fi

  output_result "FAILED_SERVICES" "$failed_count"
  if [ "$failed_count" -gt 0 ]; then
    output_result "FAILED_SERVICES_STATUS" "警告"
    echo "ERROR:检测到${failed_count}个失败服务"
  else
    output_result "FAILED_SERVICES_STATUS" "正常"
  fi
}
# Count oversized *.log files and warn when more than 3 are found.
# Arguments: optional list of directories to scan
#            (default: /var/log and /data/services).
# Globals:   LARGE_LOG_THRESHOLD_BYTES (read, optional) - a file is counted
#            when it is strictly larger than this many bytes
#            (default 1073741824, i.e. 1 GiB).
# Outputs:   LARGE_LOG_FILES and LOG_SIZE_STATUS via output_result.
check_log_file_sizes() {
  local threshold="${LARGE_LOG_THRESHOLD_BYTES:-1073741824}"
  local -a log_dirs=("$@")
  local large_logs=0
  local dir
  local found

  if [ "${#log_dirs[@]}" -eq 0 ]; then
    log_dirs=("/var/log" "/data/services")
  fi
  if [[ ! "$threshold" =~ ^[0-9]+$ ]]; then
    threshold=1073741824
  fi

  for dir in "${log_dirs[@]}"; do
    if [ ! -d "$dir" ]; then
      continue
    fi
    # Let find apply the size filter to every log file, so none are skipped
    # and no per-file stat call is needed. "+Nc" means strictly more than
    # N bytes.
    found=$(find "$dir" -name "*.log" -type f -size "+${threshold}c" 2>/dev/null \
      | grep -c . || true)
    if [[ ! "$found" =~ ^[0-9]+$ ]]; then
      found=0
    fi
    large_logs=$((large_logs + found))
  done

  output_result "LARGE_LOG_FILES" "$large_logs"
  if [ "$large_logs" -gt 3 ]; then
    output_result "LOG_SIZE_STATUS" "警告"
  else
    output_result "LOG_SIZE_STATUS" "正常"
  fi
}
# Count systemd service units in the failed state and report them under the
# BOOT_FAILED_* keys.
# NOTE(review): this runs the same systemctl query as check_failed_services;
# confirm whether a boot-specific query was intended.
# Outputs: BOOT_FAILED_SERVICES and BOOT_FAILED_STATUS via output_result.
check_boot_failed_services() {
  local boot_failed=0

  if command -v systemctl &> /dev/null; then
    # "|| true" instead of "|| echo 0": grep -c already printed the zero.
    boot_failed=$(systemctl list-units --type=service --state=failed \
      --no-pager --no-legend 2>/dev/null | grep -c "^" || true)
  fi
  if [[ ! "$boot_failed" =~ ^[0-9]+$ ]]; then
    boot_failed=0
  fi

  output_result "BOOT_FAILED_SERVICES" "$boot_failed"
  if [ "$boot_failed" -gt 0 ]; then
    output_result "BOOT_FAILED_STATUS" "警告"
  else
    output_result "BOOT_FAILED_STATUS" "正常"
  fi
}
# Report journald disk usage and whether systemd-journald is active.
# Outputs: JOURNAL_DISK_USAGE (when journalctl prints something) and
#          JOURNAL_STATUS via output_result.
check_journal_status() {
  local state="正常"
  local usage_line=""

  if ! command -v journalctl &> /dev/null; then
    state="未安装"
  else
    # Last line of "journalctl --disk-usage" is forwarded as-is.
    usage_line=$(journalctl --disk-usage 2>/dev/null | tail -1)
    if [ -n "$usage_line" ]; then
      output_result "JOURNAL_DISK_USAGE" "$usage_line"
    fi

    # Is the journald service itself running?
    if systemctl is-active systemd-journald &> /dev/null; then
      state="运行中"
    else
      state="未运行"
    fi
  fi

  output_result "JOURNAL_STATUS" "$state"
}
# Sum, over a fixed keyword list, the number of journal lines from the last
# 24 hours that contain each keyword (case-insensitive). A line containing
# two different keywords contributes two to the total.
# Outputs: ERROR_KEYWORDS_24H and ERROR_KEYWORDS_STATUS via output_result.
check_error_keywords() {
  local -a keywords=("panic" "segfault" "corruption" "malware" "intrusion")
  local total_errors=0

  if command -v journalctl &> /dev/null; then
    # Read the journal once and tally all keywords in a single awk pass,
    # instead of re-reading it per keyword. Keywords are lower-case literals,
    # so index() on the lower-cased line matches what grep -ci would count.
    total_errors=$(journalctl --since '24 hours ago' --no-pager 2>/dev/null \
      | awk -v list="${keywords[*]}" '
          BEGIN { n = split(list, kw, " ") }
          {
            line = tolower($0)
            for (i = 1; i <= n; i++) {
              if (index(line, kw[i]) > 0) total++
            }
          }
          END { print total + 0 }
        ' || true)
  fi
  if [[ ! "$total_errors" =~ ^[0-9]+$ ]]; then
    total_errors=0
  fi

  output_result "ERROR_KEYWORDS_24H" "$total_errors"
  if [ "$total_errors" -gt 0 ]; then
    output_result "ERROR_KEYWORDS_STATUS" "严重"
  else
    output_result "ERROR_KEYWORDS_STATUS" "正常"
  fi
}
# Report which syslog daemon, if any, systemd considers active.
# Units are probed in order: rsyslog, syslog-ng, syslog; the first active one
# wins. Without systemctl the initial "正常" value is reported unchanged.
# Outputs: SYSLOG_STATUS via output_result.
check_syslog_status() {
  local state="正常"
  local unit

  if command -v systemctl &> /dev/null; then
    # Assume nothing is running until one of the candidates proves active.
    state="syslog服务未运行"
    for unit in rsyslog syslog-ng syslog; do
      if systemctl is-active "$unit" &> /dev/null; then
        state="${unit}运行中"
        break
      fi
    done
  fi

  output_result "SYSLOG_STATUS" "$state"
}
# ==================== Main check flow ====================
# Run every system-log check in a fixed order, bracketed by log_info
# start/finish messages.
main() {
  local step

  log_info "开始系统日志检测..."

  # Each entry is the name of a check function; they run in list order.
  for step in \
    check_kernel_errors \
    check_auth_failures \
    check_disk_errors \
    check_oom_logs \
    check_failed_services \
    check_log_file_sizes \
    check_boot_failed_services \
    check_journal_status \
    check_error_keywords \
    check_syslog_status; do
    "$step"
  done

  log_info "系统日志检测完成"
}
# 执行主函数
main
#!/bin/bash
################################################################################
# 时间同步检测模块
# 功能: 检测NTP同步状态、时钟偏差、系统时间、SSL证书有效期等
# 作者: Claude Code
# 日期: 2026-05-09
################################################################################
# Resolve the directory containing this script, then load shared dependencies.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LIB_DIR="/tmp/check_modules"

# The configuration file is mandatory: report and abort when it is missing.
if [ ! -f "$LIB_DIR/config.sh" ]; then
  echo "ERROR: 配置文件不存在: $LIB_DIR/config.sh"
  exit 1
fi
source "$LIB_DIR/config.sh"

# The common function library is mandatory as well.
if [ ! -f "$LIB_DIR/common.sh" ]; then
  echo "ERROR: 通用函数库不存在: $LIB_DIR/common.sh"
  exit 1
fi
source "$LIB_DIR/common.sh"
# ==================== 检测函数 ====================
# Report clock synchronisation state as shown by "timedatectl status".
# Both the newer wording ("System clock synchronized", "NTP service") and
# the wording printed by older systemd releases ("NTP synchronized",
# "NTP enabled", "systemd-timesyncd.service active") are recognised.
# Outputs: NTP_SYNC_STATUS, NTP_SERVICE and NTP_STATUS_LEVEL via
#          output_result.
check_ntp_status() {
  local sync_status="未知"
  local ntp_service=""
  local timedate_output

  if command -v timedatectl &> /dev/null; then
    # Force the C locale so the English field labels below are stable.
    timedate_output=$(LC_ALL=C timedatectl status 2>/dev/null || true)

    if grep -qE '(System clock|NTP) synchronized: yes' <<< "$timedate_output"; then
      sync_status="已同步"
    elif grep -qE '(System clock|NTP) synchronized: no' <<< "$timedate_output"; then
      sync_status="未同步"
    fi

    # NOTE(review): on old systemd "NTP enabled: yes" is the closest
    # available signal; it is treated as "running" here.
    if grep -qE 'NTP service: active|NTP enabled: yes|systemd-timesyncd\.service active: yes' \
      <<< "$timedate_output"; then
      ntp_service="运行中"
    else
      ntp_service="未运行"
    fi
  fi

  output_result "NTP_SYNC_STATUS" "$sync_status"
  output_result "NTP_SERVICE" "$ntp_service"
  if [ "$sync_status" = "未同步" ]; then
    output_result "NTP_STATUS_LEVEL" "警告"
  else
    output_result "NTP_STATUS_LEVEL" "正常"
  fi
}
# Map an absolute clock offset (seconds, may be fractional) to a level:
# >= 5 critical, >= 1 warning, otherwise normal.
_clock_offset_level() {
  awk -v v="$1" 'BEGIN {
    if (v + 0 >= 5) print "严重"
    else if (v + 0 >= 1) print "警告"
    else print "正常"
  }'
}

# Measure the clock offset, preferring chronyc and falling back to ntpq.
# Outputs: CLOCK_OFFSET ("<seconds>秒") and CLOCK_OFFSET_STATUS via
#          output_result when an offset could be read, plus an "ERROR:" line
#          on stdout when the status is not normal. Nothing is emitted when
#          neither tool yields a value.
check_clock_offset() {
  local offset=""
  local status="正常"
  local raw=""

  # chronyc: "Last offset     : +0.000012345 seconds" - the value is the
  # 4th whitespace-separated field and carries an explicit sign.
  if command -v chronyc &> /dev/null; then
    raw=$(chronyc tracking 2>/dev/null | awk '/^Last offset/ { print $4; exit }')
    raw=${raw#[-+]} # absolute value: drop a leading sign
    if [[ "$raw" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
      offset="$raw"
    fi
  fi

  # ntpq fallback, used only when chronyc produced no value. The selected
  # peer line starts with "*"; the offset is column 9 (milliseconds).
  if [ -z "$offset" ] && command -v ntpq &> /dev/null; then
    raw=$(ntpq -pn 2>/dev/null | awk '/^\*/ { print $9; exit }')
    raw=${raw#[-+]}
    if [[ "$raw" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
      offset=$(awk -v ms="$raw" 'BEGIN { printf "%.3f", ms / 1000 }')
    fi
  fi

  if [ -n "$offset" ]; then
    status=$(_clock_offset_level "$offset")
    output_result "CLOCK_OFFSET" "${offset}秒"
    output_result "CLOCK_OFFSET_STATUS" "$status"
    if [ "$status" != "正常" ]; then
      echo "ERROR:时钟偏差过大: ${offset}秒"
    fi
  fi
}
# Report the current local and UTC time as printed by date(1).
# Outputs: SYSTEM_TIME and UTC_TIME via output_result (each only when the
#          corresponding date call printed something).
check_system_time() {
  local local_now=""
  local utc_now=""

  # Nothing to report when date is unavailable.
  if ! command -v date &> /dev/null; then
    return 0
  fi

  local_now=$(date 2>/dev/null)
  if [ -n "$local_now" ]; then
    output_result "SYSTEM_TIME" "$local_now"
  fi

  utc_now=$(date -u 2>/dev/null)
  if [ -n "$utc_now" ]; then
    output_result "UTC_TIME" "$utc_now"
  fi
}
# Report the configured time zone.
# /etc/timezone is preferred; otherwise the zone is derived from the target
# of the /etc/localtime symlink (the part after ".../zoneinfo/").
# Outputs: TIMEZONE via output_result ("未知" when it cannot be determined).
check_timezone() {
  local zone=""

  if [ -f /etc/timezone ]; then
    zone=$(cat /etc/timezone 2>/dev/null)
  elif [ -f /etc/localtime ] && [ -L /etc/localtime ]; then
    # Only a symlink carries the zone name; a plain copy does not.
    zone=$(readlink /etc/localtime 2>/dev/null | sed 's|.*/zoneinfo/||')
  fi

  output_result "TIMEZONE" "${zone:-未知}"
}
# Check how long the TLS certificate served by an endpoint remains valid.
# Arguments: $1 - host:port to probe (optional, default localhost:443)
# Outputs: SSL_CERT_STATUS, SSL_CERT_DAYS_LEFT, SSL_CERT_EXPIRY and
#          SSL_CERT_LEVEL via output_result when a certificate could be read;
#          "ERROR:" lines on stdout for expired / soon-to-expire (< 7 days)
#          certificates. Nothing is emitted when no certificate or no
#          parsable expiry date is obtained.
# Levels:  expired or < 7 days -> critical, < 30 days -> warning.
check_ssl_cert() {
  local endpoint="${1:-localhost:443}"
  local cert_status="正常"
  local days_left=0
  local expiry_date=""
  local cert_info=""
  local cert_ts=""
  local current_ts=""

  if ! command -v openssl &> /dev/null; then
    output_result "SSL_CERT_STATUS" "openssl不可用"
    return
  fi

  # "echo |" closes stdin so s_client exits right after the handshake.
  cert_info=$(echo | openssl s_client -connect "$endpoint" 2>/dev/null \
    | openssl x509 -noout -dates 2>/dev/null || true)
  expiry_date=$(printf '%s\n' "$cert_info" | sed -n 's/^notAfter=//p' | head -n 1)
  if [ -z "$expiry_date" ]; then
    return 0
  fi

  # Convert the certificate date to an epoch timestamp (GNU date syntax).
  cert_ts=$(date -d "$expiry_date" +%s 2>/dev/null || true)
  if [[ ! "$cert_ts" =~ ^[0-9]+$ ]]; then
    return 0
  fi
  current_ts=$(date +%s)
  days_left=$(((cert_ts - current_ts) / 86400))

  # Decide "expired" on the raw timestamps: integer division truncates
  # toward zero, so a certificate that expired less than a day ago would
  # otherwise show 0 days left and not be reported as expired.
  if [ "$cert_ts" -le "$current_ts" ]; then
    cert_status="严重"
    output_result "SSL_CERT_STATUS" "已过期"
    echo "ERROR:SSL证书已过期"
  elif [ "$days_left" -lt 7 ]; then
    cert_status="严重"
    output_result "SSL_CERT_STATUS" "即将过期"
    echo "ERROR:SSL证书将在${days_left}天后过期"
  elif [ "$days_left" -lt 30 ]; then
    cert_status="警告"
    output_result "SSL_CERT_STATUS" "即将过期"
  else
    output_result "SSL_CERT_STATUS" "正常"
  fi

  output_result "SSL_CERT_DAYS_LEFT" "$days_left"
  output_result "SSL_CERT_EXPIRY" "$expiry_date"
  output_result "SSL_CERT_LEVEL" "$cert_status"
}
# Check the remaining validity of the certificate EMQX serves on port 8883,
# probing from inside the container named by CONTAINERS[emqx].
# Globals: CONTAINERS (read).
# Outputs: EMQX_CERT_STATUS when the container is not running; otherwise
#          EMQX_CERT_DAYS_LEFT and EMQX_CERT_LEVEL via output_result when an
#          expiry date could be read and parsed. Nothing is emitted when
#          docker is missing or no date is obtained.
# Levels:  < 7 days -> critical, < 30 days -> warning.
check_emqx_cert() {
  local level="正常"
  local remaining=0
  local dates
  local not_after
  local expires_at
  local now

  # The container must be running before anything else is attempted.
  if ! is_container_running "${CONTAINERS[emqx]}"; then
    output_result "EMQX_CERT_STATUS" "容器未运行"
    return
  fi
  if ! command -v docker &> /dev/null; then
    return 0
  fi

  dates=$(docker exec "${CONTAINERS[emqx]}" sh -c "echo | openssl s_client -connect localhost:8883 2>/dev/null | openssl x509 -noout -dates 2>/dev/null" 2>/dev/null)
  if [ -z "$dates" ]; then
    return 0
  fi

  not_after=$(sed -n 's/notAfter=//p' <<< "$dates")
  if [ -z "$not_after" ]; then
    return 0
  fi

  # Epoch timestamp of the expiry date (relies on date -d).
  expires_at=$(date -d "$not_after" +%s 2>/dev/null)
  if [ -z "$expires_at" ]; then
    return 0
  fi

  now=$(date +%s)
  remaining=$(((expires_at - now) / 86400))
  if ((remaining < 7)); then
    level="严重"
  elif ((remaining < 30)); then
    level="警告"
  fi

  output_result "EMQX_CERT_DAYS_LEFT" "$remaining"
  output_result "EMQX_CERT_LEVEL" "$level"
}
# Report which time synchronisation daemon systemd considers active.
# Units are probed in order: systemd-timesyncd, chronyd, ntpd; the first
# active one wins. Without systemctl the status stays "未知".
# Outputs: TIMESYNC_SERVICE, TIMESYNC_STATUS and TIMESYNC_LEVEL via
#          output_result.
check_timesync_service() {
  local state="未知"
  local active_unit=""
  local candidate

  if command -v systemctl &> /dev/null; then
    # Assume nothing is running until one of the candidates proves active.
    state="未运行"
    for candidate in systemd-timesyncd chronyd ntpd; do
      if systemctl is-active "$candidate" &> /dev/null; then
        active_unit="$candidate"
        state="运行中"
        break
      fi
    done
  fi

  output_result "TIMESYNC_SERVICE" "$active_unit"
  output_result "TIMESYNC_STATUS" "$state"

  # Only a definite "not running" is a warning; "unknown" stays normal.
  case "$state" in
    "未运行") output_result "TIMESYNC_LEVEL" "警告" ;;
    *) output_result "TIMESYNC_LEVEL" "正常" ;;
  esac
}
# ==================== Main check flow ====================
# Run every time-synchronisation check in a fixed order, bracketed by
# log_info start/finish messages.
main() {
  local step

  log_info "开始时间同步检测..."

  # Each entry is the name of a check function; they run in list order.
  for step in \
    check_ntp_status \
    check_clock_offset \
    check_system_time \
    check_timezone \
    check_timesync_service \
    check_ssl_cert \
    check_emqx_cert; do
    "$step"
  done

  log_info "时间同步检测完成"
}
# 执行主函数
main
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论