mirror of
https://github.com/deepmodeling/Uni-Lab-OS
synced 2026-03-28 01:55:52 +00:00
v0.10.19
fast registry load minor fix on skill & registry stripe ros2 schema desc add create-device-skill new registry system backwards to yaml remove not exist resource new registry sys exp. support with add device add ai conventions correct raise create resource error ret info fix revert ret info fix fix prcxi check add create_resource schema re signal host ready event add websocket connection timeout and improve reconnection logic add open_timeout parameter to websocket connection add TimeoutError and InvalidStatus exception handling implement exponential backoff for reconnection attempts simplify reconnection logic flow add gzip change pose extra to any add isFlapY
This commit is contained in:
@@ -26,6 +26,7 @@ from enum import Enum
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
from unilabos.app.model import JobAddReq
|
||||
from unilabos.resources.resource_tracker import ResourceDictType
|
||||
from unilabos.ros.nodes.presets.host_node import HostNode
|
||||
from unilabos.utils.type_check import serialize_result_info
|
||||
from unilabos.app.communication import BaseCommunicationClient
|
||||
@@ -408,6 +409,7 @@ class MessageProcessor:
|
||||
# 线程控制
|
||||
self.is_running = False
|
||||
self.thread = None
|
||||
self._loop = None # asyncio event loop引用,用于外部关闭websocket
|
||||
self.reconnect_count = 0
|
||||
|
||||
logger.info(f"[MessageProcessor] Initialized for URL: {websocket_url}")
|
||||
@@ -434,22 +436,31 @@ class MessageProcessor:
|
||||
def stop(self) -> None:
|
||||
"""停止消息处理线程"""
|
||||
self.is_running = False
|
||||
# 主动关闭websocket以快速中断消息接收循环
|
||||
ws = self.websocket
|
||||
loop = self._loop
|
||||
if ws and loop and loop.is_running():
|
||||
try:
|
||||
asyncio.run_coroutine_threadsafe(ws.close(), loop)
|
||||
except Exception:
|
||||
pass
|
||||
if self.thread and self.thread.is_alive():
|
||||
self.thread.join(timeout=2)
|
||||
logger.info("[MessageProcessor] Stopped")
|
||||
|
||||
def _run(self):
|
||||
"""运行消息处理主循环"""
|
||||
loop = asyncio.new_event_loop()
|
||||
self._loop = asyncio.new_event_loop()
|
||||
try:
|
||||
asyncio.set_event_loop(loop)
|
||||
loop.run_until_complete(self._connection_handler())
|
||||
asyncio.set_event_loop(self._loop)
|
||||
self._loop.run_until_complete(self._connection_handler())
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Thread error: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
finally:
|
||||
if loop:
|
||||
loop.close()
|
||||
if self._loop:
|
||||
self._loop.close()
|
||||
self._loop = None
|
||||
|
||||
async def _connection_handler(self):
|
||||
"""处理WebSocket连接和重连逻辑"""
|
||||
@@ -466,8 +477,10 @@ class MessageProcessor:
|
||||
async with websockets.connect(
|
||||
self.websocket_url,
|
||||
ssl=ssl_context,
|
||||
open_timeout=20,
|
||||
ping_interval=WSConfig.ping_interval,
|
||||
ping_timeout=10,
|
||||
close_timeout=5,
|
||||
additional_headers={
|
||||
"Authorization": f"Lab {BasicConfig.auth_secret()}",
|
||||
"EdgeSession": f"{self.session_id}",
|
||||
@@ -478,85 +491,98 @@ class MessageProcessor:
|
||||
self.connected = True
|
||||
self.reconnect_count = 0
|
||||
|
||||
logger.info(f"[MessageProcessor] Connected to {self.websocket_url}")
|
||||
logger.info(f"[MessageProcessor] 已连接到 {self.websocket_url}")
|
||||
|
||||
# 启动发送协程
|
||||
send_task = asyncio.create_task(self._send_handler())
|
||||
send_task = asyncio.create_task(self._send_handler(), name="websocket-send_task")
|
||||
|
||||
# 每次连接(含重连)后重新向服务端注册,
|
||||
# 否则服务端不知道客户端已上线,不会推送消息。
|
||||
if self.websocket_client:
|
||||
self.websocket_client.publish_host_ready()
|
||||
|
||||
try:
|
||||
# 接收消息循环
|
||||
await self._message_handler()
|
||||
finally:
|
||||
# 必须在 async with __aexit__ 之前停止 send_task,
|
||||
# 否则 send_task 会在关闭握手期间继续发送数据,
|
||||
# 干扰 websockets 库的内部清理,导致 task 泄漏。
|
||||
self.connected = False
|
||||
send_task.cancel()
|
||||
try:
|
||||
await send_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
self.connected = False
|
||||
|
||||
except websockets.exceptions.ConnectionClosed:
|
||||
logger.warning("[MessageProcessor] Connection closed")
|
||||
self.connected = False
|
||||
logger.warning("[MessageProcessor] 与服务端连接中断")
|
||||
except TimeoutError:
|
||||
logger.warning(
|
||||
f"[MessageProcessor] 与服务端连接通信超时 (已尝试 {self.reconnect_count + 1} 次),请检查您的网络状况"
|
||||
)
|
||||
except websockets.exceptions.InvalidStatus as e:
|
||||
logger.warning(
|
||||
f"[MessageProcessor] 收到服务端注册码 {e.response.status_code}, 上一进程可能还未退出"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Connection error: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
self.connected = False
|
||||
logger.error(f"[MessageProcessor] 尝试重连时出错 {str(e)}")
|
||||
finally:
|
||||
self.connected = False
|
||||
self.websocket = None
|
||||
|
||||
# 重连逻辑
|
||||
if self.is_running and self.reconnect_count < WSConfig.max_reconnect_attempts:
|
||||
if not self.is_running:
|
||||
break
|
||||
if self.reconnect_count < WSConfig.max_reconnect_attempts:
|
||||
self.reconnect_count += 1
|
||||
backoff = WSConfig.reconnect_interval
|
||||
logger.info(
|
||||
f"[MessageProcessor] Reconnecting in {WSConfig.reconnect_interval}s "
|
||||
f"(attempt {self.reconnect_count}/{WSConfig.max_reconnect_attempts})"
|
||||
f"[MessageProcessor] 即将在 {backoff} 秒后重连 (已尝试 {self.reconnect_count}/{WSConfig.max_reconnect_attempts})"
|
||||
)
|
||||
await asyncio.sleep(WSConfig.reconnect_interval)
|
||||
elif self.reconnect_count >= WSConfig.max_reconnect_attempts:
|
||||
await asyncio.sleep(backoff)
|
||||
else:
|
||||
logger.error("[MessageProcessor] Max reconnection attempts reached")
|
||||
break
|
||||
else:
|
||||
self.reconnect_count -= 1
|
||||
|
||||
async def _message_handler(self):
|
||||
"""处理接收到的消息"""
|
||||
"""处理接收到的消息。
|
||||
|
||||
ConnectionClosed 不在此处捕获,让其向上传播到 _connection_handler,
|
||||
以便 async with websockets.connect() 的 __aexit__ 能感知连接已断,
|
||||
正确清理内部 task,避免 task 泄漏。
|
||||
"""
|
||||
if not self.websocket:
|
||||
logger.error("[MessageProcessor] WebSocket connection is None")
|
||||
return
|
||||
|
||||
try:
|
||||
async for message in self.websocket:
|
||||
try:
|
||||
data = json.loads(message)
|
||||
message_type = data.get("action", "")
|
||||
message_data = data.get("data")
|
||||
if self.session_id and self.session_id == data.get("edge_session"):
|
||||
await self._process_message(message_type, message_data)
|
||||
async for message in self.websocket:
|
||||
try:
|
||||
data = json.loads(message)
|
||||
message_type = data.get("action", "")
|
||||
message_data = data.get("data")
|
||||
if self.session_id and self.session_id == data.get("edge_session"):
|
||||
await self._process_message(message_type, message_data)
|
||||
else:
|
||||
if message_type.endswith("_material"):
|
||||
logger.trace(
|
||||
f"[MessageProcessor] 收到一条归属 {data.get('edge_session')} 的旧消息:{data}"
|
||||
)
|
||||
logger.debug(
|
||||
f"[MessageProcessor] 跳过了一条归属 {data.get('edge_session')} 的旧消息: {data.get('action')}"
|
||||
)
|
||||
else:
|
||||
if message_type.endswith("_material"):
|
||||
logger.trace(
|
||||
f"[MessageProcessor] 收到一条归属 {data.get('edge_session')} 的旧消息:{data}"
|
||||
)
|
||||
logger.debug(
|
||||
f"[MessageProcessor] 跳过了一条归属 {data.get('edge_session')} 的旧消息: {data.get('action')}"
|
||||
)
|
||||
else:
|
||||
await self._process_message(message_type, message_data)
|
||||
except json.JSONDecodeError:
|
||||
logger.error(f"[MessageProcessor] Invalid JSON received: {message}")
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Error processing message: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
except websockets.exceptions.ConnectionClosed:
|
||||
logger.info("[MessageProcessor] Message handler stopped - connection closed")
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Message handler error: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
await self._process_message(message_type, message_data)
|
||||
except json.JSONDecodeError:
|
||||
logger.error(f"[MessageProcessor] Invalid JSON received: {message}")
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Error processing message: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
async def _send_handler(self):
|
||||
"""处理发送队列中的消息"""
|
||||
logger.debug("[MessageProcessor] Send handler started")
|
||||
logger.trace("[MessageProcessor] Send handler started")
|
||||
|
||||
try:
|
||||
while self.connected and self.websocket:
|
||||
@@ -601,6 +627,7 @@ class MessageProcessor:
|
||||
|
||||
except asyncio.CancelledError:
|
||||
logger.debug("[MessageProcessor] Send handler cancelled")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Fatal error in send handler: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
@@ -632,6 +659,10 @@ class MessageProcessor:
|
||||
# elif message_type == "session_id":
|
||||
# self.session_id = message_data.get("session_id")
|
||||
# logger.info(f"[MessageProcessor] Session ID: {self.session_id}")
|
||||
elif message_type == "add_device":
|
||||
await self._handle_device_manage(message_data, "add")
|
||||
elif message_type == "remove_device":
|
||||
await self._handle_device_manage(message_data, "remove")
|
||||
elif message_type == "request_restart":
|
||||
await self._handle_request_restart(message_data)
|
||||
else:
|
||||
@@ -968,6 +999,37 @@ class MessageProcessor:
|
||||
)
|
||||
thread.start()
|
||||
|
||||
async def _handle_device_manage(self, device_list: list[ResourceDictType], action: str):
|
||||
"""Handle add_device / remove_device from LabGo server."""
|
||||
if not device_list:
|
||||
return
|
||||
|
||||
for item in device_list:
|
||||
target_node_id = item.get("target_node_id", "host_node")
|
||||
|
||||
def _notify(target_id: str, act: str, cfg: ResourceDictType):
|
||||
try:
|
||||
host_node = HostNode.get_instance(timeout=5)
|
||||
if not host_node:
|
||||
logger.error(f"[DeviceManage] HostNode not available for {act}_device")
|
||||
return
|
||||
success = host_node.notify_device_manage(target_id, act, cfg)
|
||||
if success:
|
||||
logger.info(f"[DeviceManage] {act}_device completed on {target_id}")
|
||||
else:
|
||||
logger.warning(f"[DeviceManage] {act}_device failed on {target_id}")
|
||||
except Exception as e:
|
||||
logger.error(f"[DeviceManage] Error in {act}_device: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
thread = threading.Thread(
|
||||
target=_notify,
|
||||
args=(target_node_id, action, item),
|
||||
daemon=True,
|
||||
name=f"DeviceManage-{action}-{item.get('id', '')}",
|
||||
)
|
||||
thread.start()
|
||||
|
||||
async def _handle_request_restart(self, data: Dict[str, Any]):
|
||||
"""
|
||||
处理重启请求
|
||||
@@ -979,10 +1041,9 @@ class MessageProcessor:
|
||||
logger.info(f"[MessageProcessor] Received restart request, reason: {reason}, delay: {delay}s")
|
||||
|
||||
# 发送确认消息
|
||||
if self.websocket_client:
|
||||
await self.websocket_client.send_message(
|
||||
{"action": "restart_acknowledged", "data": {"reason": reason, "delay": delay}}
|
||||
)
|
||||
self.send_message(
|
||||
{"action": "restart_acknowledged", "data": {"reason": reason, "delay": delay}}
|
||||
)
|
||||
|
||||
# 设置全局重启标志
|
||||
import unilabos.app.main as main_module
|
||||
@@ -1084,13 +1145,14 @@ class QueueProcessor:
|
||||
def stop(self) -> None:
|
||||
"""停止队列处理线程"""
|
||||
self.is_running = False
|
||||
self.queue_update_event.set() # 立即唤醒等待中的线程
|
||||
if self.thread and self.thread.is_alive():
|
||||
self.thread.join(timeout=2)
|
||||
logger.info("[QueueProcessor] Stopped")
|
||||
|
||||
def _run(self):
|
||||
"""运行队列处理主循环"""
|
||||
logger.debug("[QueueProcessor] Queue processor started")
|
||||
logger.trace("[QueueProcessor] Queue processor started")
|
||||
|
||||
while self.is_running:
|
||||
try:
|
||||
@@ -1305,7 +1367,6 @@ class WebSocketClient(BaseCommunicationClient):
|
||||
else:
|
||||
url = f"{scheme}://{parsed.netloc}/api/v1/ws/schedule"
|
||||
|
||||
logger.debug(f"[WebSocketClient] URL: {url}")
|
||||
return url
|
||||
|
||||
def start(self) -> None:
|
||||
@@ -1318,13 +1379,11 @@ class WebSocketClient(BaseCommunicationClient):
|
||||
logger.error("[WebSocketClient] WebSocket URL not configured")
|
||||
return
|
||||
|
||||
logger.info(f"[WebSocketClient] Starting connection to {self.websocket_url}")
|
||||
|
||||
# 启动两个核心线程
|
||||
self.message_processor.start()
|
||||
self.queue_processor.start()
|
||||
|
||||
logger.info("[WebSocketClient] All threads started")
|
||||
logger.trace("[WebSocketClient] All threads started")
|
||||
|
||||
def stop(self) -> None:
|
||||
"""停止WebSocket客户端"""
|
||||
@@ -1340,8 +1399,8 @@ class WebSocketClient(BaseCommunicationClient):
|
||||
message = {"action": "normal_exit", "data": {"session_id": session_id}}
|
||||
self.message_processor.send_message(message)
|
||||
logger.info(f"[WebSocketClient] Sent normal_exit message with session_id: {session_id}")
|
||||
# 给一点时间让消息发送出去
|
||||
time.sleep(1)
|
||||
# send_handler 每100ms检查一次队列,等300ms足以让消息发出
|
||||
time.sleep(0.3)
|
||||
except Exception as e:
|
||||
logger.warning(f"[WebSocketClient] Failed to send normal_exit message: {str(e)}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user