提交 d6c7bfc3 authored 作者: 陈泽健's avatar 陈泽健

feat(monitoring): 添加自动化服务监测Shell脚本

- 实现日志暴涨检测功能,按总行数窗口监控日志变化
- 添加ERROR上下文检测,提取最近错误日志的上下文信息
- 集成内存监控功能,记录内存使用峰值和平均值
- 实现MySQL连接数监控,支持容器和本机两种方式
- 支持新旧平台自动识别,适配不同部署环境
- 提供完整的日志记录和告警机制
- 实现定时循环监控,支持自定义监控间隔
上级 85f3c1fe
This source diff could not be displayed because it is too large. You can view the blob instead.
#!/usr/bin/env bash
# filepath: /data/scripts/AutomatedServiceMonitoring.sh
########################################
# Automated service monitoring shell script (single-host edition).
# It monitors the machine it is executed on (the local host).
#
# NOTE: the shebang must be the very first line of the file. A stray
# markdown fence (''''bash) used to precede it; the shell parses that as
# the command `bash`, which spawned a nested shell before the monitor
# could start, so it has been removed.
########################################
set -u # abort on any reference to an undefined variable
# For debugging, enable tracing with: set -x
#################### Global configuration ####################
# Every setting below can be overridden from the environment, e.g.
#   INTERVAL_SECONDS=30 MYSQL_PASSWORD='secret' ./AutomatedServiceMonitoring.sh
# When a variable is unset or empty, the historical default is used.

# Monitoring interval in seconds.
INTERVAL_SECONDS="${INTERVAL_SECONDS:-60}"
# MySQL account (kept consistent with the PRD).
# NOTE(review): a credential default is still embedded here for backward
# compatibility; prefer supplying MYSQL_PASSWORD via the environment.
MYSQL_USER="${MYSQL_USER:-root}"
MYSQL_PASSWORD="${MYSQL_PASSWORD:-dNrprU&2S}"
# Log file (defaults to a file in the current working directory).
LOG_FILE="${LOG_FILE:-./AutomatedServiceMonitoring.sh.log}"
# Identifier of the current host; falls back to $HOSTNAME (or "unknown")
# when the `hostname` command is unavailable.
HOST_NAME="$(hostname 2>/dev/null || printf '%s' "${HOSTNAME:-unknown}")"
#################### 简单日志函数 ####################
#######################################
# Emit one timestamped log record on stdout and append it to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1  - severity level (INFO/WARN/ERROR)
#            $2+ - message words, joined with spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS] [LEVEL] message"
#######################################
log() {
  local severity="$1"
  shift
  local stamp record
  stamp="$(date '+%F %T')"
  printf -v record '[%s] [%s] %s' "$stamp" "$severity" "$*"
  printf '%s\n' "$record"
  printf '%s\n' "$record" >>"$LOG_FILE"
}
#################### 平台识别 ####################
#######################################
# Decide which deployment layout this host uses.
# Globals written:
#   PLATFORM_TYPE - "new" when /data/services exists, otherwise "legacy"
#   BASE_PATH     - "/data/services" or "/var/www" respectively
#######################################
detect_platform() {
  # Start from the legacy layout and upgrade when the new root is present.
  PLATFORM_TYPE="legacy"
  BASE_PATH="/var/www"
  if [[ -d /data/services ]]; then
    PLATFORM_TYPE="new"
    BASE_PATH="/data/services"
  fi
}
#################### 系统识别(ujava/upython/upython_voice) ####################
#######################################
# Detect which service containers run on this host by inspecting the
# names reported by `docker ps`.
# Globals written:
#   HAS_UJAVA, HAS_UPYTHON, HAS_UPYTHON_VOICE - 0/1 flags
#   SYSTEMS - array holding any of: meeting / transcription / ops
# A failing or missing docker command yields an empty name list, so all
# flags stay 0 and SYSTEMS stays empty.
#######################################
detect_systems() {
  HAS_UJAVA=0
  HAS_UPYTHON=0
  HAS_UPYTHON_VOICE=0
  SYSTEMS=()

  # Names of the running containers, one per line.
  local names
  names="$(docker ps --format '{{.Names}}' 2>/dev/null || true)"

  if grep -q "ujava" <<<"$names"; then
    HAS_UJAVA=1
    SYSTEMS+=("meeting")
  fi

  if grep -q "upython_voice" <<<"$names"; then
    HAS_UPYTHON_VOICE=1
    SYSTEMS+=("transcription")
  fi

  # "upython" is a prefix of "upython_voice": drop the voice containers
  # first, otherwise a host running only upython_voice would also be
  # reported as running upython ("ops").
  if grep -v "upython_voice" <<<"$names" | grep -q "upython"; then
    HAS_UPYTHON=1
    SYSTEMS+=("ops")
  fi
}
#################### 日志目标解析 ####################
#######################################
# Build the list of log files to audit from the detected platform/systems.
# Globals read:    PLATFORM_TYPE, BASE_PATH, SYSTEMS
# Globals written: LOG_TARGET_SYS[]  - system label of each target
#                  LOG_TARGET_PATH[] - log file path of each target
#                  (parallel arrays, same index = same target)
# Only the "meeting" system has log targets; without it both arrays end
# up empty.
#######################################
resolve_log_targets() {
  LOG_TARGET_SYS=()
  LOG_TARGET_PATH=()

  local has_meeting=0
  local s
  # ${arr[@]+"${arr[@]}"} expands to nothing for an empty/unset array.
  # A plain "${SYSTEMS[@]}" aborts the script under `set -u` on
  # bash < 4.4 when no system was detected.
  for s in ${SYSTEMS[@]+"${SYSTEMS[@]}"}; do
    if [[ "$s" == "meeting" ]]; then
      has_meeting=1
      break
    fi
  done

  if (( has_meeting == 0 )); then
    return 0
  fi

  if [[ "$PLATFORM_TYPE" == "new" ]]; then
    LOG_TARGET_SYS+=("meeting-2.0")
    LOG_TARGET_PATH+=("$BASE_PATH/api/java-meeting/java-meeting2.0/logs/ubains-INFO-AND-ERROR.log")
    LOG_TARGET_SYS+=("meeting-3.0")
    LOG_TARGET_PATH+=("$BASE_PATH/api/java-meeting/java-meeting3.0/logs/ubains-INFO-AND-ERROR.log")
  else
    LOG_TARGET_SYS+=("meeting-2.0")
    LOG_TARGET_PATH+=("/var/www/java/api-java-meeting2.0/logs/ubains-INFO-AND-ERROR.log")
  fi
  return 0
}
#################### 日志暴涨检测(按总行数窗口) ####################
# Per-log state for burst detection, keyed by log path:
#   BURST_LAST_TOTAL - total line count at the start of the current window
#   BURST_LAST_TS    - epoch seconds at the start of the current window
declare -A BURST_LAST_TOTAL
declare -A BURST_LAST_TS

#######################################
# Take one sample of a log file's total line count and, once a full
# observation window has elapsed, decide whether the log grew abnormally.
# Globals:   HOST_NAME (read), BURST_LAST_TOTAL / BURST_LAST_TS (read+write)
# Arguments: $1 - system label used in messages
#            $2 - path of the log file
# Outputs:   INFO/WARN records through log()
#
# The window baseline is advanced only when the window completes.
# Resetting it on every sample would keep `elapsed` at roughly the
# polling interval, so with an interval shorter than the window the
# burst evaluation could never run.
#######################################
monitor_log_burst_once() {
  local sys_name="$1"
  local log_path="$2"
  local window_seconds=300        # minimum observation window
  local min_lines_threshold=1000  # absolute growth that counts as a burst
  local rate_threshold_per_sec=5  # growth rate that counts as a burst

  if [[ ! -f "$log_path" ]]; then
    log INFO "[日志暴涨审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 文件不存在"
    return
  fi

  # Some wc implementations pad the count with blanks; strip them.
  local total_lines
  total_lines="$(wc -l < "$log_path" 2>/dev/null || echo 0)"
  total_lines="${total_lines//[[:space:]]/}"
  if ! [[ "$total_lines" =~ ^[0-9]+$ ]]; then
    log INFO "[日志暴涨审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 无法获取总行数"
    return
  fi

  local now_ts
  now_ts="$(date +%s)"
  local key="$log_path"
  local last_total="${BURST_LAST_TOTAL[$key]-}"
  local last_ts="${BURST_LAST_TS[$key]-}"

  # First sample for this log: record the baseline and wait for the next one.
  if [[ -z "$last_total" || -z "$last_ts" ]]; then
    BURST_LAST_TOTAL["$key"]="$total_lines"
    BURST_LAST_TS["$key"]="$now_ts"
    log INFO "[日志暴涨审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 初始化窗口,总行数=$total_lines"
    return
  fi

  local elapsed=$(( now_ts - last_ts ))
  if (( elapsed <= 0 )); then
    elapsed=1
  fi

  local delta_lines=$(( total_lines - last_total ))
  if (( delta_lines < 0 )); then
    # The file shrank (rotation/truncation): the old baseline is meaningless,
    # so rebase the line count while keeping the window start time.
    delta_lines=0
    BURST_LAST_TOTAL["$key"]="$total_lines"
  fi

  # Still inside the window: keep accumulating against the same baseline.
  if (( elapsed < window_seconds )); then
    log INFO "[日志暴涨审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 累积中:窗口=${elapsed}s 未达 ${window_seconds}s,新增行数=${delta_lines}"
    return
  fi

  # Window complete: the current sample becomes the next window's baseline.
  BURST_LAST_TOTAL["$key"]="$total_lines"
  BURST_LAST_TS["$key"]="$now_ts"

  # Bash has no floating point; let awk compute the rate.
  local rate
  rate=$(awk -v d="$delta_lines" -v e="$elapsed" 'BEGIN{ if(e<=0){e=1}; printf "%.2f", d/e }')

  # `date -d @epoch` is GNU-specific; fall back to "now" elsewhere.
  local start_ts_human end_ts_human
  start_ts_human="$(date -d @"$last_ts" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
  end_ts_human="$(date -d @"$now_ts" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"

  if (( delta_lines >= min_lines_threshold )) \
    || awk -v r="$rate" -v th="$rate_threshold_per_sec" 'BEGIN{exit !(r>=th)}'; then
    log WARN "[日志打印暴涨] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 窗口=${elapsed}s 新增行数=${delta_lines} 速率=${rate}行/秒 时间段=[${start_ts_human} ~ ${end_ts_human}]"
  else
    log INFO "[日志暴涨审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 窗口=${elapsed}s 新增行数=${delta_lines} 速率=${rate}行/秒 未发现暴涨"
  fi
}
#################### ERROR 上下文检测 ####################
#######################################
# Scan the last 5000 lines of a log for "ERROR" and report the number of
# hits together with the surrounding context of the most recent ones.
# Globals:   HOST_NAME (read)
# Arguments: $1 - system label used in messages
#            $2 - path of the log file
# Outputs:   INFO/WARN records through log()
#######################################
monitor_log_errors_once() {
  local sys_name="$1"
  local log_path="$2"
  local max_context=50 # lines kept before and after each hit
  local max_sections=3 # number of most recent hits to show

  if [[ ! -f "$log_path" ]]; then
    log INFO "[ERROR审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 文件不存在"
    return
  fi

  # Work on the most recent 5000 lines only.
  local tail_out
  tail_out="$(tail -n 5000 "$log_path" 2>/dev/null || true)"
  if [[ -z "${tail_out//[[:space:]]/}" ]]; then
    log INFO "[ERROR审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 最近5000行无内容"
    return
  fi

  # Number of lines mentioning ERROR (grep -c exits 1 on zero matches).
  local cnt
  cnt="$(echo "$tail_out" | grep -c "ERROR" || true)"
  [[ "$cnt" =~ ^[0-9]+$ ]] || cnt=0
  if (( cnt == 0 )); then
    log INFO "[ERROR审计] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 最近段落未发现 ERROR"
    return
  fi

  # Number the captured lines, keep the line numbers of the latest hits and
  # print a +/- max_context window around each of them. Everything runs in
  # the command-substitution subshell, so the loop variables stay private.
  local ctx
  ctx="$(
    while read -r hit; do
      from=$(( hit - max_context ))
      to=$(( hit + max_context ))
      if [ "$from" -lt 1 ]; then
        from=1
      fi
      echo "$tail_out" | sed -n "${from},${to}p"
      echo
      echo "---- 上下文分隔线 (行号: $hit) ----"
      echo
    done < <(echo "$tail_out" | nl -ba | awk '/ERROR/ {print $1}' | tail -n "$max_sections")
  )"

  # First and last timestamp (YYYY-MM-DD HH:MM:SS) seen in the context.
  local ts_pattern='[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}'
  local start_ts end_ts
  start_ts="$(echo "$ctx" | grep -oE "$ts_pattern" | head -n1)"
  end_ts="$(echo "$ctx" | grep -oE "$ts_pattern" | tail -n1)"

  if [[ -z "$start_ts" && -z "$end_ts" ]]; then
    log WARN "[ERROR出现] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 最近5000行 ERROR数量=$cnt"
  else
    log WARN "[ERROR出现] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path 最近5000行 ERROR数量=$cnt 发生时间段=[$start_ts ~ $end_ts]"
  fi

  log INFO "[ERROR上下文] 主机=$HOST_NAME 系统=$sys_name 日志=$log_path
$ctx"
}
#################### 内存监控(/proc/meminfo) ####################
# Host-wide running statistics for memory usage:
#   MEM_SAMPLES      - number of samples taken so far
#   MEM_SUM_USED_MB  - sum of all sampled "used" values (for the average)
#   MEM_PEAK_USED_MB - highest "used" value seen
#   MEM_PEAK_TS      - epoch seconds at which the peak was observed (0 = none)
MEM_SAMPLES=0
MEM_SUM_USED_MB=0
MEM_PEAK_USED_MB=0
MEM_PEAK_TS=0

#######################################
# Sample /proc/meminfo once and log current, average and peak usage.
# "Used" is computed as MemTotal - MemAvailable, in whole megabytes.
# Globals: HOST_NAME (read), MEM_* counters (read+write)
# Outputs: INFO/ERROR records through log()
#######################################
monitor_mem_once() {
  local meminfo
  if ! meminfo="$(cat /proc/meminfo 2>/dev/null)"; then
    log ERROR "[内存监测] $HOST_NAME => 获取 /proc/meminfo 失败"
    return
  fi

  local total_kb avail_kb
  total_kb="$(awk '/^MemTotal:/ {print $2}' <<<"$meminfo")"
  avail_kb="$(awk '/^MemAvailable:/ {print $2}' <<<"$meminfo")"
  if [[ -z "$total_kb" || -z "$avail_kb" ]]; then
    log ERROR "[内存监测] $HOST_NAME => 解析失败"
    return
  fi

  # kB -> MB, rounded by awk; negative usage is clamped to zero.
  local total_mb used_mb
  total_mb=$(awk -v t="$total_kb" 'BEGIN{printf "%.0f",t/1024}')
  used_mb=$(awk -v t="$total_kb" -v a="$avail_kb" 'BEGIN{u=(t-a)/1024; if(u<0)u=0; printf "%.0f",u}')

  # Fold the sample into the running statistics.
  (( MEM_SAMPLES += 1 ))
  (( MEM_SUM_USED_MB += used_mb ))
  if (( used_mb > MEM_PEAK_USED_MB )); then
    MEM_PEAK_USED_MB=$used_mb
    MEM_PEAK_TS="$(date +%s)"
  fi
  local avg_used=$(( MEM_SUM_USED_MB / MEM_SAMPLES ))

  # Human-readable time of the peak; `date -d @epoch` is GNU-specific, so
  # fall back to the current time when it fails.
  local peak_human="N/A"
  if [[ -n "$MEM_PEAK_TS" && "$MEM_PEAK_TS" != "0" ]]; then
    peak_human="$(date -d @"$MEM_PEAK_TS" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
  fi

  log INFO "[内存监测] $HOST_NAME 当前使用=${used_mb}MB 总=${total_mb}MB 平均=${avg_used}MB 峰值=${MEM_PEAK_USED_MB}MB@$peak_human"
}
#################### MySQL 连接数监控 ####################
# Running statistics for MySQL connection sampling:
#   sample count, sum of sampled connection counts, peak count and the
#   epoch time of that peak (0 = no peak recorded yet).
MYSQL_SAMPLES=0 MYSQL_SUM_CONN=0
MYSQL_PEAK_CONN=0 MYSQL_PEAK_TS=0
# Baseline (connection count and epoch time) used by the surge check;
# empty until the first successful sample.
MYSQL_LAST_TOTAL='' MYSQL_LAST_TS=''
#######################################
# Print the name of the first running container whose name or image
# mentions "mysql" or "mariadb" (case-insensitive); print nothing when
# there is none or docker is unavailable.
# Outputs: container name on stdout, or no output
#######################################
find_mysql_container() {
  # Each line is "<name> <image>"; neither keyword contains whitespace, so
  # matching the lower-cased line is the same as matching name and image.
  docker ps --format '{{.Names}} {{.Image}}' 2>/dev/null \
    | awk 'tolower($0) ~ /mysql|mariadb/ { print $1; exit }'
}
#######################################
# Read the current MySQL connection count by running the client inside a
# container: first `mysql ... SHOW STATUS`, then `mysqladmin status`.
# Globals:   MYSQL_USER, MYSQL_PASSWORD (read)
# Arguments: $1 - container name
# Outputs:   the connection count on stdout
# Returns:   0 when a numeric value was obtained, 1 otherwise
#######################################
get_mysql_threads_connected_via_container() {
  local container="$1"
  # Keep the credentials in an array so that a password containing blanks
  # or glob characters reaches the client as exactly one argument.
  local -a auth=("-u${MYSQL_USER}" "-p${MYSQL_PASSWORD}")

  # Preferred: the Threads_connected status variable (tab-separated output).
  local out
  out="$(docker exec -i "${container}" mysql -ss "${auth[@]}" \
    -e "SHOW STATUS LIKE 'Threads_connected';" 2>/dev/null | tail -n1 | cut -f2)"
  if [[ "$out" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$out"
    return 0
  fi

  # Fallback: parse "Threads: N" out of `mysqladmin status`. The trailing
  # number is extracted directly so "Threads:N" (no blank) works as well.
  out="$(docker exec -i "${container}" mysqladmin "${auth[@]}" status 2>/dev/null)"
  local val
  val="$(grep -oE 'Threads:[[:space:]]*[0-9]+' <<<"$out" | grep -oE '[0-9]+$' | head -n1)"
  if [[ "$val" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$val"
    return 0
  fi

  return 1
}
#######################################
# Sample the MySQL connection count once, update the running statistics
# and, once a full observation window has elapsed, check for a surge.
# Lookup order: MySQL/MariaDB container -> local mysqladmin -> local mysql.
# Globals: HOST_NAME, MYSQL_USER, MYSQL_PASSWORD (read)
#          MYSQL_SAMPLES, MYSQL_SUM_CONN, MYSQL_PEAK_CONN, MYSQL_PEAK_TS,
#          MYSQL_LAST_TOTAL, MYSQL_LAST_TS (read+write)
# Outputs: INFO/WARN/ERROR records through log()
#
# MYSQL_LAST_TOTAL / MYSQL_LAST_TS hold the baseline taken at the start of
# the current window and are advanced only when the window completes.
# Resetting them on every sample would keep `elapsed` at roughly the
# polling interval, so the surge evaluation could never run.
#######################################
monitor_mysql_once() {
  # Credentials as an array: each element is passed as exactly one argument
  # even if the password contains blanks or glob characters.
  local -a auth=("-u${MYSQL_USER}" "-p${MYSQL_PASSWORD}")
  local conn=""
  local out

  # 1) Try a MySQL/MariaDB container.
  local container
  container="$(find_mysql_container || true)"
  if [[ -n "$container" ]]; then
    conn="$(get_mysql_threads_connected_via_container "$container" || true)"
    if ! [[ "$conn" =~ ^[0-9]+$ ]]; then
      log ERROR "[MySQL监测] $HOST_NAME => 在容器 $container 内获取连接数失败"
      conn=""
    fi
  else
    log INFO "[MySQL监测] $HOST_NAME => 未检测到 MySQL/MariaDB 容器,回退本机命令"
  fi

  # 2) Fall back to the local mysqladmin ("... Threads: N ...").
  if [[ -z "$conn" ]]; then
    out="$(mysqladmin "${auth[@]}" status 2>/dev/null || true)"
    conn="$(grep -oE 'Threads:[[:space:]]*[0-9]+' <<<"$out" | grep -oE '[0-9]+$' | head -n1)"
  fi

  # 3) Fall back to the local mysql client.
  if ! [[ "$conn" =~ ^[0-9]+$ ]]; then
    out="$(mysql -ss "${auth[@]}" -e "SHOW STATUS LIKE 'Threads_connected';" 2>/dev/null | tail -n1 | cut -f2)"
    if [[ "$out" =~ ^[0-9]+$ ]]; then
      conn="$out"
    fi
  fi

  if ! [[ "$conn" =~ ^[0-9]+$ ]]; then
    log ERROR "[MySQL监测] $HOST_NAME => 获取连接数失败(请确认 Docker/MySQL 客户端与权限)"
    return
  fi

  # Fold the sample into the running statistics.
  local now_ts
  now_ts="$(date +%s)"
  MYSQL_SAMPLES=$((MYSQL_SAMPLES + 1))
  MYSQL_SUM_CONN=$((MYSQL_SUM_CONN + conn))
  if (( conn > MYSQL_PEAK_CONN )); then
    MYSQL_PEAK_CONN=$conn
    MYSQL_PEAK_TS="$now_ts"
  fi
  local avg_conn=$(( MYSQL_SUM_CONN / MYSQL_SAMPLES ))

  # `date -d @epoch` is GNU-specific; fall back to "now" elsewhere.
  local peak_human="N/A"
  if [[ -n "$MYSQL_PEAK_TS" && "$MYSQL_PEAK_TS" != "0" ]]; then
    peak_human="$(date -d @"$MYSQL_PEAK_TS" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
  fi
  log INFO "[MySQL监测] $HOST_NAME 当前连接数=$conn 平均=$avg_conn 峰值=${MYSQL_PEAK_CONN}@$peak_human"

  # Surge detection over an observation window.
  local window_seconds=300
  local min_burst_conn=200
  local rate_threshold_per_sec=1

  # First successful sample: record the baseline only.
  if [[ -z "${MYSQL_LAST_TOTAL:-}" || -z "${MYSQL_LAST_TS:-}" ]]; then
    MYSQL_LAST_TOTAL="$conn"
    MYSQL_LAST_TS="$now_ts"
    return
  fi

  local elapsed=$(( now_ts - MYSQL_LAST_TS ))
  if (( elapsed <= 0 )); then
    elapsed=1
  fi
  # Still inside the window: keep the baseline untouched.
  if (( elapsed < window_seconds )); then
    return
  fi

  local delta_conn=$(( conn - MYSQL_LAST_TOTAL ))
  if (( delta_conn < 0 )); then
    delta_conn=0
  fi

  # Window complete: the current sample becomes the next baseline.
  MYSQL_LAST_TOTAL="$conn"
  MYSQL_LAST_TS="$now_ts"

  local rate
  rate=$(awk -v d="$delta_conn" -v e="$elapsed" 'BEGIN{ if(e<=0){e=1}; printf "%.2f", d/e }')
  local start_ts_human end_ts_human
  start_ts_human="$(date -d @"$((now_ts - elapsed))" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"
  end_ts_human="$(date -d @"$now_ts" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')"

  if (( delta_conn >= min_burst_conn )) \
    || awk -v r="$rate" -v th="$rate_threshold_per_sec" 'BEGIN{exit !(r>=th)}'; then
    log WARN "[MySQL连接暴涨] $HOST_NAME 窗口=${elapsed}s 增量=${delta_conn} 速率=${rate}/s 时间段=[${start_ts_human} ~ ${end_ts_human}]"
  else
    log INFO "[MySQL暴涨审计] $HOST_NAME 窗口=${elapsed}s 增量=${delta_conn} 速率=${rate}/s 未发现暴涨"
  fi
}
#################### 主循环:监控本机 ####################
#######################################
# Entry routine: detect the platform and the installed systems once, then
# run every monitor in an endless loop, sleeping INTERVAL_SECONDS between
# rounds.
# Globals: HOST_NAME, INTERVAL_SECONDS (read); PLATFORM_TYPE, BASE_PATH,
#          HAS_*, SYSTEMS, LOG_TARGET_SYS, LOG_TARGET_PATH (read; written
#          by the detect_*/resolve_* helpers this routine calls)
# Outputs: INFO records through log(); never returns
#######################################
main_loop() {
  log INFO "[启动] 自动化服务监测 Shell 脚本,目标服务器(本机)=${HOST_NAME}"
  detect_platform
  detect_systems

  # Human-readable platform labels, computed once. The previous inline
  # `$( [[ ... ]] && echo ... || echo ... )` forms were broken: the
  # heartbeat one echoed nothing, and the other used unquoted parentheses
  # inside the command substitution.
  local platform_name platform_detail
  if [[ "$PLATFORM_TYPE" == "new" ]]; then
    platform_name="新统一平台"
    platform_detail="新统一平台(/data/services)"
  else
    platform_name="传统平台"
    platform_detail="传统平台(/var/www)"
  fi

  log INFO "[平台识别] $HOST_NAME => 平台类型=${platform_detail} 基路径=${BASE_PATH}"
  # ${SYSTEMS[*]:-} keeps `set -u` quiet when no system was detected.
  log INFO "[系统识别] $HOST_NAME => ujava=${HAS_UJAVA} upython=${HAS_UPYTHON} upython_voice=${HAS_UPYTHON_VOICE} 系统=(${SYSTEMS[*]:-})"

  local i sys_name log_path
  while true; do
    log INFO "[心跳] $HOST_NAME: 平台=${platform_name} 基路径=${BASE_PATH} 系统=${SYSTEMS[*]:-}"

    # Log audits: burst detection and ERROR context for every target.
    resolve_log_targets
    for ((i = 0; i < ${#LOG_TARGET_SYS[@]}; i++)); do
      sys_name="${LOG_TARGET_SYS[$i]}"
      log_path="${LOG_TARGET_PATH[$i]}"
      monitor_log_burst_once "$sys_name" "$log_path"
      monitor_log_errors_once "$sys_name" "$log_path"
    done

    # Memory monitoring.
    monitor_mem_once
    # MySQL monitoring.
    monitor_mysql_once

    sleep "$INTERVAL_SECONDS"
  done
}
#################### Script entry point ####################
# Start monitoring the local host; main_loop runs an endless `while true`
# loop, so nothing placed after this call is reached.
main_loop
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论