提交 7a9d4d6f 作者: 陈泽健

优化日志监控的异常处理。处理路径拼接方式,处理服务出现错误日志没有发送的问题,调试重复发送问题。

上级 fe624c3b
...@@ -36,7 +36,7 @@ def dingding_send_message(error_log_url,ding_type): ...@@ -36,7 +36,7 @@ def dingding_send_message(error_log_url,ding_type):
log_type = '展厅预定-对内服务服务监测' log_type = '展厅预定-对内服务服务监测'
elif ding_type == '展厅预定系统-对外服务服务监测': elif ding_type == '展厅预定系统-对外服务服务监测':
log_type = '展厅预定-对外服务服务监测' log_type = '展厅预定-对外服务日志出现异常'
logging.info(f"预定服务日志类型:{log_type}") logging.info(f"预定服务日志类型:{log_type}")
......
...@@ -8,4 +8,4 @@ ...@@ -8,4 +8,4 @@
- 处理运行12小时后被远程主机主动断开连接问题,通过配置 SSH Client 的 keepalive 参数,让连接保持活跃,避免超时断开。 - 处理运行12小时后被远程主机主动断开连接问题,通过配置 SSH Client 的 keepalive 参数,让连接保持活跃,避免超时断开。
- 增加多台服务器的连接监测,补充对展厅的日志监测,调整error_log日志文件存放路径,补充对应的ngrok映射目录。 - 增加多台服务器的连接监测,补充对展厅的日志监测,调整error_log日志文件存放路径,补充对应的ngrok映射目录。
3. 2025-06-09: 3. 2025-06-09:
- 优化日志监控的异常处理。 - 优化日志监控的异常处理。处理路径拼接方式,处理服务出现错误日志没有发送的问题,调试重复发送问题。
\ No newline at end of file \ No newline at end of file
...@@ -5,6 +5,7 @@ import logging ...@@ -5,6 +5,7 @@ import logging
import sys import sys
import os import os
import json import json
import socket # 用于捕获 socket.error
# 配置日志输出到控制台 # 配置日志输出到控制台
console_handler = logging.StreamHandler() console_handler = logging.StreamHandler()
...@@ -37,21 +38,24 @@ except ImportError as e: ...@@ -37,21 +38,24 @@ except ImportError as e:
class LogMonitor: class LogMonitor:
def __init__(self, host, username, private_key_path, passphrase, log_path, check_interval=1, ding_type="标准版服务监测"): def __init__(self, host, username, private_key_path, passphrase, log_path,
check_interval=1, ding_type="标准版服务监测", resend_interval=300):
self.host = host self.host = host
self.username = username self.username = username
self.private_key_path = private_key_path self.private_key_path = private_key_path
self.passphrase = passphrase self.passphrase = passphrase
self.log_path = log_path self.log_path = log_path
self.check_interval = check_interval # 日志检查间隔(秒) self.check_interval = check_interval
self.ding_type = ding_type # 钉钉群标识 self.ding_type = ding_type
self.client = None self.client = None
self.channel = None self.channel = None
self.collecting = False self.collecting = False
self.lock = threading.Lock() self.lock = threading.Lock()
self.line_buffer = [] # 缓存最近若干行日志,用于上下文提取 self.line_buffer = []
self.buffer_size = 200 # 缓存最多保留多少行日志 self.buffer_size = 500
self.error_contexts = [] # 存储所有错误日志的上下文 self.error_contexts = []
self.sent_errors = {} # 已发送的错误日志 {hash: last_send_time}
self.resend_interval = resend_interval # 钉钉重发冷却时间(秒)
def connect(self): def connect(self):
try: try:
...@@ -68,8 +72,8 @@ class LogMonitor: ...@@ -68,8 +72,8 @@ class LogMonitor:
) )
self.channel = self.client.invoke_shell() self.channel = self.client.invoke_shell()
self.channel.setblocking(0) # 设置为非阻塞模式 self.channel.setblocking(0)
self.channel.transport.set_keepalive(30) # 每隔 30 秒发一次 keepalive 包 self.channel.transport.set_keepalive(30)
self.channel.send(f"tail -f {self.log_path}\n") self.channel.send(f"tail -f {self.log_path}\n")
logging.info(f"Connected to {self.host}, monitoring {self.log_path}") logging.info(f"Connected to {self.host}, monitoring {self.log_path}")
...@@ -105,11 +109,16 @@ class LogMonitor: ...@@ -105,11 +109,16 @@ class LogMonitor:
while self.collecting: while self.collecting:
try: try:
if self.channel.recv_ready(): if self.channel.recv_ready():
... data = self.channel.recv(1024).decode('utf-8', errors='ignore')
logging.debug("Received raw data: %s", data)
for line in data.splitlines():
self._process_line(line.strip())
retry_count = 0
else: else:
time.sleep(self.check_interval) time.sleep(self.check_interval)
retry_count = 0 # 成功时重置重试次数 retry_count = 0
except (paramiko.SSHException, paramiko.socket.error, OSError) as e: except (paramiko.SSHException, socket.error, OSError) as e:
logging.warning(f"SSH 断开,准备重连... 错误: {e}") logging.warning(f"SSH 断开,准备重连... 错误: {e}")
self.restart_monitoring() self.restart_monitoring()
...@@ -118,28 +127,26 @@ class LogMonitor: ...@@ -118,28 +127,26 @@ class LogMonitor:
logging.error("达到最大重试次数,停止监控") logging.error("达到最大重试次数,停止监控")
self.stop_monitoring() self.stop_monitoring()
return return
time.sleep(min(5 * retry_count, 60)) # 指数退避 time.sleep(min(5 * retry_count, 60))
def save_error_contexts_to_json(self): def save_error_contexts_to_json(self):
# 获取当前脚本所在目录 try:
current_dir = os.path.dirname(os.path.abspath(__file__)) current_file = __file__
except NameError:
import inspect
current_file = inspect.getframeinfo(inspect.currentframe()).filename
# 构建正确的相对路径(指向 ubains-module-test 根目录) current_dir = os.path.dirname(os.path.abspath(current_file))
root_dir = os.path.dirname(current_dir) # 返回到 "日志监测" root_dir = os.path.dirname(current_dir)
root_dir = os.path.dirname(root_dir) # 返回到 "ubains-module-test" root_dir = os.path.dirname(root_dir)
# 拼接目标路径
full_path = os.path.normpath(os.path.join(root_dir, "日志监测", "error_log")) full_path = os.path.normpath(os.path.join(root_dir, "日志监测", "error_log"))
# 确保目录存在
os.makedirs(full_path, exist_ok=True) os.makedirs(full_path, exist_ok=True)
# 文件名带时间戳
timestamp = time.strftime("%Y-%m-%d-%H:%M") timestamp = time.strftime("%Y-%m-%d-%H:%M")
filename = f"error_log{timestamp}.json" filename = f"error_log{timestamp}.json"
file_path = os.path.join(full_path, filename) file_path = os.path.join(full_path, filename)
# 写入 JSON 文件
try: try:
with open(file_path, 'w', encoding='utf-8') as f: with open(file_path, 'w', encoding='utf-8') as f:
json.dump(self.error_contexts, f, ensure_ascii=False, indent=4) json.dump(self.error_contexts, f, ensure_ascii=False, indent=4)
...@@ -150,20 +157,11 @@ class LogMonitor: ...@@ -150,20 +157,11 @@ class LogMonitor:
return None return None
def generate_error_log_url(self, file_path): def generate_error_log_url(self, file_path):
# 根据本地存储路径,生成公网访问链接。
# 示例:
# file_path = D:\GithubData\自动化\ubains-module-test\预定系统\reports\error_log\error_log2025-06-05-20:15.json
# url = http://nat.ubainsyun.com:31133/error_log/error_log2025-06-05-20:15.json
if not file_path: if not file_path:
return None return None
# 从文件路径中提取最后一级目录名(error_log)和文件名
dir_name = os.path.basename(os.path.dirname(file_path)) # 获取 error_log
filename = os.path.basename(file_path) filename = os.path.basename(file_path)
error_log_url = f"http://nat.ubainsyun.com:32233/{filename}"
# 拼接 URL
error_log_url = f"http://nat.ubainsyun.com:31133/{dir_name}/{filename}"
logging.info(f"生成公网访问链接: {error_log_url}") logging.info(f"生成公网访问链接: {error_log_url}")
return error_log_url return error_log_url
...@@ -173,12 +171,11 @@ class LogMonitor: ...@@ -173,12 +171,11 @@ class LogMonitor:
if len(self.line_buffer) > self.buffer_size: if len(self.line_buffer) > self.buffer_size:
self.line_buffer.pop(0) self.line_buffer.pop(0)
# 提取日志级别字段(如 INFO / ERROR)
try: try:
level_part = line.split(" : ")[0] # 取 "时间戳 LEVEL" 部分 level_part = line.split(" : ")[0]
level = level_part.split()[-1] # 取最后一个词作为日志级别 level = level_part.split()[-1]
if level in ["ERROR", "Exception"]: if any(keyword in line.upper() for keyword in ["ERROR"]):
logging.info(f"发现 {level} 日志!正在通过 SSH 获取上下文日志...") logging.info(f"发现 {level} 日志!正在通过 SSH 获取上下文日志...")
full_log = self.get_remote_log_with_paramiko( full_log = self.get_remote_log_with_paramiko(
...@@ -187,7 +184,7 @@ class LogMonitor: ...@@ -187,7 +184,7 @@ class LogMonitor:
private_key_path=self.private_key_path, private_key_path=self.private_key_path,
passphrase=self.passphrase, passphrase=self.passphrase,
log_path=self.log_path, log_path=self.log_path,
num_lines=200 num_lines=500
) )
if full_log: if full_log:
...@@ -198,7 +195,6 @@ class LogMonitor: ...@@ -198,7 +195,6 @@ class LogMonitor:
end = min(len(lines), i + 101) end = min(len(lines), i + 101)
context = lines[start:end] context = lines[start:end]
# 将上下文日志保存到 error_contexts 中
with self.lock: with self.lock:
self.error_contexts.append({ self.error_contexts.append({
'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'), 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
...@@ -206,13 +202,20 @@ class LogMonitor: ...@@ -206,13 +202,20 @@ class LogMonitor:
'context': context 'context': context
}) })
# 保存为 JSON 并生成公网链接
file_path = self.save_error_contexts_to_json() file_path = self.save_error_contexts_to_json()
error_log_url = self.generate_error_log_url(file_path) error_log_url = self.generate_error_log_url(file_path)
# 调用钉钉发送函数 error_hash = hash(line.strip())
current_time = time.time()
if error_hash in self.sent_errors:
if current_time - self.sent_errors[error_hash] < self.resend_interval:
logging.info(f"该错误已在冷却期内,跳过重复发送:{line[:100]}...")
break
try: try:
dingding_send_message(error_log_url, ding_type=self.ding_type) dingding_send_message(error_log_url, ding_type=self.ding_type)
self.sent_errors[error_hash] = current_time
except Exception as e: except Exception as e:
logging.info(f"发送钉钉消息失败: {e}") logging.info(f"发送钉钉消息失败: {e}")
...@@ -220,24 +223,23 @@ class LogMonitor: ...@@ -220,24 +223,23 @@ class LogMonitor:
break break
else: else:
logging.error("获取日志失败,无法获取上下文") logging.error("获取日志失败,无法获取上下文")
logging.debug("Received line: %s", line)
except IndexError: except IndexError:
pass pass
except Exception as e: except Exception as e:
logging.exception(f"获取上下文日志失败: {e}") logging.exception(f"获取上下文日志失败: {e}")
def restart_monitoring(self): def restart_monitoring(self):
"""自动重启日志监控"""
logging.info("尝试重新启动日志监控...") logging.info("尝试重新启动日志监控...")
self.stop_monitoring() self.stop_monitoring()
time.sleep(5) time.sleep(5)
self.start_monitoring() self.start_monitoring()
@staticmethod @staticmethod
def get_remote_log_with_paramiko(host, username, private_key_path, passphrase, log_path, num_lines=1000, timeout=30, def get_remote_log_with_paramiko(host, username, private_key_path, passphrase,
filter_word=None): log_path, num_lines=1000, timeout=30):
"""
使用 Paramiko 获取远程服务器的日志文件内容,并通过过滤词过滤日志内容.
"""
try: try:
private_key = paramiko.RSAKey.from_private_key_file(private_key_path, password=passphrase) private_key = paramiko.RSAKey.from_private_key_file(private_key_path, password=passphrase)
client = paramiko.SSHClient() client = paramiko.SSHClient()
...@@ -270,7 +272,6 @@ class LogMonitor: ...@@ -270,7 +272,6 @@ class LogMonitor:
if __name__ == "__main__": if __name__ == "__main__":
# 多个服务器配置
SERVERS = [ SERVERS = [
{ {
"host": "192.168.5.235", "host": "192.168.5.235",
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论