mirror of
https://github.com/deepmodeling/Uni-Lab-OS
synced 2026-03-26 08:53:06 +00:00
Merge branch 'dev' into prcix9320
This commit is contained in:
@@ -23,9 +23,10 @@ from typing import Optional, Dict, Any, List
|
||||
from urllib.parse import urlparse
|
||||
from enum import Enum
|
||||
|
||||
from jedi.inference.gradual.typing import TypedDict
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
from unilabos.app.model import JobAddReq
|
||||
from unilabos.resources.resource_tracker import ResourceDictType
|
||||
from unilabos.ros.nodes.presets.host_node import HostNode
|
||||
from unilabos.utils.type_check import serialize_result_info
|
||||
from unilabos.app.communication import BaseCommunicationClient
|
||||
@@ -164,7 +165,7 @@ class DeviceActionManager:
|
||||
job_info.set_ready_timeout(10) # 设置10秒超时
|
||||
self.active_jobs[device_key] = job_info
|
||||
job_log = format_job_log(job_info.job_id, job_info.task_id, job_info.device_id, job_info.action_name)
|
||||
logger.info(f"[DeviceActionManager] Job {job_log} can start immediately for {device_key}")
|
||||
logger.trace(f"[DeviceActionManager] Job {job_log} can start immediately for {device_key}")
|
||||
return True
|
||||
|
||||
def start_job(self, job_id: str) -> bool:
|
||||
@@ -231,8 +232,9 @@ class DeviceActionManager:
|
||||
job_info.update_timestamp()
|
||||
# 从all_jobs中移除已结束的job
|
||||
del self.all_jobs[job_id]
|
||||
job_log = format_job_log(job_info.job_id, job_info.task_id, job_info.device_id, job_info.action_name)
|
||||
logger.info(f"[DeviceActionManager] Job {job_log} ended for {device_key}")
|
||||
# job_log = format_job_log(job_info.job_id, job_info.task_id, job_info.device_id, job_info.action_name)
|
||||
# logger.debug(f"[DeviceActionManager] Job {job_log} ended for {device_key}")
|
||||
pass
|
||||
else:
|
||||
job_log = format_job_log(job_info.job_id, job_info.task_id, job_info.device_id, job_info.action_name)
|
||||
logger.warning(f"[DeviceActionManager] Job {job_log} was not active for {device_key}")
|
||||
@@ -248,7 +250,7 @@ class DeviceActionManager:
|
||||
next_job_log = format_job_log(
|
||||
next_job.job_id, next_job.task_id, next_job.device_id, next_job.action_name
|
||||
)
|
||||
logger.info(f"[DeviceActionManager] Next job {next_job_log} can start for {device_key}")
|
||||
logger.trace(f"[DeviceActionManager] Next job {next_job_log} can start for {device_key}")
|
||||
return next_job
|
||||
|
||||
return None
|
||||
@@ -302,7 +304,7 @@ class DeviceActionManager:
|
||||
# 从all_jobs中移除
|
||||
del self.all_jobs[job_id]
|
||||
job_log = format_job_log(job_info.job_id, job_info.task_id, job_info.device_id, job_info.action_name)
|
||||
logger.info(f"[DeviceActionManager] Active job {job_log} cancelled for {device_key}")
|
||||
logger.trace(f"[DeviceActionManager] Active job {job_log} cancelled for {device_key}")
|
||||
|
||||
# 启动下一个任务
|
||||
if device_key in self.device_queues and self.device_queues[device_key]:
|
||||
@@ -315,7 +317,7 @@ class DeviceActionManager:
|
||||
next_job_log = format_job_log(
|
||||
next_job.job_id, next_job.task_id, next_job.device_id, next_job.action_name
|
||||
)
|
||||
logger.info(f"[DeviceActionManager] Next job {next_job_log} can start after cancel")
|
||||
logger.trace(f"[DeviceActionManager] Next job {next_job_log} can start after cancel")
|
||||
return True
|
||||
|
||||
# 如果是排队中的任务
|
||||
@@ -329,7 +331,7 @@ class DeviceActionManager:
|
||||
job_log = format_job_log(
|
||||
job_info.job_id, job_info.task_id, job_info.device_id, job_info.action_name
|
||||
)
|
||||
logger.info(f"[DeviceActionManager] Queued job {job_log} cancelled for {device_key}")
|
||||
logger.trace(f"[DeviceActionManager] Queued job {job_log} cancelled for {device_key}")
|
||||
return True
|
||||
|
||||
job_log = format_job_log(job_info.job_id, job_info.task_id, job_info.device_id, job_info.action_name)
|
||||
@@ -407,6 +409,7 @@ class MessageProcessor:
|
||||
# 线程控制
|
||||
self.is_running = False
|
||||
self.thread = None
|
||||
self._loop = None # asyncio event loop引用,用于外部关闭websocket
|
||||
self.reconnect_count = 0
|
||||
|
||||
logger.info(f"[MessageProcessor] Initialized for URL: {websocket_url}")
|
||||
@@ -433,22 +436,31 @@ class MessageProcessor:
|
||||
def stop(self) -> None:
    """Stop the message-processing thread.

    Clears the running flag, asks the event loop that owns the websocket to
    close it so the receive loop is interrupted quickly, and then waits up
    to 2 seconds for the worker thread to finish.
    """
    self.is_running = False
    # Snapshot both references first: the worker thread resets
    # self.websocket and self._loop to None while it shuts down.
    ws = self.websocket
    loop = self._loop
    if ws and loop and loop.is_running():
        try:
            # The close coroutine has to be scheduled on the loop running in
            # the worker thread; stop() itself is called from another thread.
            asyncio.run_coroutine_threadsafe(ws.close(), loop)
        except Exception as e:
            # Best effort only (e.g. the loop stopped between the check and
            # the call), but leave a trace instead of hiding the failure.
            logger.debug(f"[MessageProcessor] Failed to schedule websocket close: {str(e)}")
    if self.thread and self.thread.is_alive():
        self.thread.join(timeout=2)
    logger.info("[MessageProcessor] Stopped")
|
||||
|
||||
def _run(self):
|
||||
"""运行消息处理主循环"""
|
||||
loop = asyncio.new_event_loop()
|
||||
self._loop = asyncio.new_event_loop()
|
||||
try:
|
||||
asyncio.set_event_loop(loop)
|
||||
loop.run_until_complete(self._connection_handler())
|
||||
asyncio.set_event_loop(self._loop)
|
||||
self._loop.run_until_complete(self._connection_handler())
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Thread error: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
finally:
|
||||
if loop:
|
||||
loop.close()
|
||||
if self._loop:
|
||||
self._loop.close()
|
||||
self._loop = None
|
||||
|
||||
async def _connection_handler(self):
|
||||
"""处理WebSocket连接和重连逻辑"""
|
||||
@@ -465,8 +477,10 @@ class MessageProcessor:
|
||||
async with websockets.connect(
|
||||
self.websocket_url,
|
||||
ssl=ssl_context,
|
||||
open_timeout=20,
|
||||
ping_interval=WSConfig.ping_interval,
|
||||
ping_timeout=10,
|
||||
close_timeout=5,
|
||||
additional_headers={
|
||||
"Authorization": f"Lab {BasicConfig.auth_secret()}",
|
||||
"EdgeSession": f"{self.session_id}",
|
||||
@@ -477,77 +491,94 @@ class MessageProcessor:
|
||||
self.connected = True
|
||||
self.reconnect_count = 0
|
||||
|
||||
logger.trace(f"[MessageProcessor] Connected to {self.websocket_url}")
|
||||
logger.info(f"[MessageProcessor] 已连接到 {self.websocket_url}")
|
||||
|
||||
# 启动发送协程
|
||||
send_task = asyncio.create_task(self._send_handler())
|
||||
send_task = asyncio.create_task(self._send_handler(), name="websocket-send_task")
|
||||
|
||||
# 每次连接(含重连)后重新向服务端注册,
|
||||
# 否则服务端不知道客户端已上线,不会推送消息。
|
||||
if self.websocket_client:
|
||||
self.websocket_client.publish_host_ready()
|
||||
|
||||
try:
|
||||
# 接收消息循环
|
||||
await self._message_handler()
|
||||
finally:
|
||||
# 必须在 async with __aexit__ 之前停止 send_task,
|
||||
# 否则 send_task 会在关闭握手期间继续发送数据,
|
||||
# 干扰 websockets 库的内部清理,导致 task 泄漏。
|
||||
self.connected = False
|
||||
send_task.cancel()
|
||||
try:
|
||||
await send_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
self.connected = False
|
||||
|
||||
except websockets.exceptions.ConnectionClosed:
|
||||
logger.warning("[MessageProcessor] Connection closed")
|
||||
self.connected = False
|
||||
logger.warning("[MessageProcessor] 与服务端连接中断")
|
||||
except TimeoutError:
|
||||
logger.warning(
|
||||
f"[MessageProcessor] 与服务端连接通信超时 (已尝试 {self.reconnect_count + 1} 次),请检查您的网络状况"
|
||||
)
|
||||
except websockets.exceptions.InvalidStatus as e:
|
||||
logger.warning(
|
||||
f"[MessageProcessor] 收到服务端注册码 {e.response.status_code}, 上一进程可能还未退出"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Connection error: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
self.connected = False
|
||||
logger.error(f"[MessageProcessor] 尝试重连时出错 {str(e)}")
|
||||
finally:
|
||||
self.connected = False
|
||||
self.websocket = None
|
||||
|
||||
# 重连逻辑
|
||||
if self.is_running and self.reconnect_count < WSConfig.max_reconnect_attempts:
|
||||
if not self.is_running:
|
||||
break
|
||||
if self.reconnect_count < WSConfig.max_reconnect_attempts:
|
||||
self.reconnect_count += 1
|
||||
backoff = WSConfig.reconnect_interval
|
||||
logger.info(
|
||||
f"[MessageProcessor] Reconnecting in {WSConfig.reconnect_interval}s "
|
||||
f"(attempt {self.reconnect_count}/{WSConfig.max_reconnect_attempts})"
|
||||
f"[MessageProcessor] 即将在 {backoff} 秒后重连 (已尝试 {self.reconnect_count}/{WSConfig.max_reconnect_attempts})"
|
||||
)
|
||||
await asyncio.sleep(WSConfig.reconnect_interval)
|
||||
elif self.reconnect_count >= WSConfig.max_reconnect_attempts:
|
||||
await asyncio.sleep(backoff)
|
||||
else:
|
||||
logger.error("[MessageProcessor] Max reconnection attempts reached")
|
||||
break
|
||||
else:
|
||||
self.reconnect_count -= 1
|
||||
|
||||
async def _message_handler(self):
|
||||
"""处理接收到的消息"""
|
||||
"""处理接收到的消息。
|
||||
|
||||
ConnectionClosed 不在此处捕获,让其向上传播到 _connection_handler,
|
||||
以便 async with websockets.connect() 的 __aexit__ 能感知连接已断,
|
||||
正确清理内部 task,避免 task 泄漏。
|
||||
"""
|
||||
if not self.websocket:
|
||||
logger.error("[MessageProcessor] WebSocket connection is None")
|
||||
return
|
||||
|
||||
try:
|
||||
async for message in self.websocket:
|
||||
try:
|
||||
data = json.loads(message)
|
||||
message_type = data.get("action", "")
|
||||
message_data = data.get("data")
|
||||
if self.session_id and self.session_id == data.get("edge_session"):
|
||||
await self._process_message(message_type, message_data)
|
||||
async for message in self.websocket:
|
||||
try:
|
||||
data = json.loads(message)
|
||||
message_type = data.get("action", "")
|
||||
message_data = data.get("data")
|
||||
if self.session_id and self.session_id == data.get("edge_session"):
|
||||
await self._process_message(message_type, message_data)
|
||||
else:
|
||||
if message_type.endswith("_material"):
|
||||
logger.trace(
|
||||
f"[MessageProcessor] 收到一条归属 {data.get('edge_session')} 的旧消息:{data}"
|
||||
)
|
||||
logger.debug(
|
||||
f"[MessageProcessor] 跳过了一条归属 {data.get('edge_session')} 的旧消息: {data.get('action')}"
|
||||
)
|
||||
else:
|
||||
if message_type.endswith("_material"):
|
||||
logger.trace(f"[MessageProcessor] 收到一条归属 {data.get('edge_session')} 的旧消息:{data}")
|
||||
logger.debug(f"[MessageProcessor] 跳过了一条归属 {data.get('edge_session')} 的旧消息: {data.get('action')}")
|
||||
else:
|
||||
await self._process_message(message_type, message_data)
|
||||
except json.JSONDecodeError:
|
||||
logger.error(f"[MessageProcessor] Invalid JSON received: {message}")
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Error processing message: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
except websockets.exceptions.ConnectionClosed:
|
||||
logger.info("[MessageProcessor] Message handler stopped - connection closed")
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Message handler error: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
await self._process_message(message_type, message_data)
|
||||
except json.JSONDecodeError:
|
||||
logger.error(f"[MessageProcessor] Invalid JSON received: {message}")
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Error processing message: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
async def _send_handler(self):
|
||||
"""处理发送队列中的消息"""
|
||||
@@ -596,6 +627,7 @@ class MessageProcessor:
|
||||
|
||||
except asyncio.CancelledError:
|
||||
logger.debug("[MessageProcessor] Send handler cancelled")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Fatal error in send handler: {str(e)}")
|
||||
logger.error(traceback.format_exc())
|
||||
@@ -604,7 +636,7 @@ class MessageProcessor:
|
||||
|
||||
async def _process_message(self, message_type: str, message_data: Dict[str, Any]):
|
||||
"""处理收到的消息"""
|
||||
logger.debug(f"[MessageProcessor] Processing message: {message_type}")
|
||||
logger.trace(f"[MessageProcessor] Processing message: {message_type}")
|
||||
|
||||
try:
|
||||
if message_type == "pong":
|
||||
@@ -627,6 +659,10 @@ class MessageProcessor:
|
||||
# elif message_type == "session_id":
|
||||
# self.session_id = message_data.get("session_id")
|
||||
# logger.info(f"[MessageProcessor] Session ID: {self.session_id}")
|
||||
elif message_type == "add_device":
|
||||
await self._handle_device_manage(message_data, "add")
|
||||
elif message_type == "remove_device":
|
||||
await self._handle_device_manage(message_data, "remove")
|
||||
elif message_type == "request_restart":
|
||||
await self._handle_request_restart(message_data)
|
||||
else:
|
||||
@@ -698,13 +734,13 @@ class MessageProcessor:
|
||||
await self._send_action_state_response(
|
||||
device_id, action_name, task_id, job_id, "query_action_status", True, 0
|
||||
)
|
||||
logger.info(f"[MessageProcessor] Job {job_log} can start immediately")
|
||||
logger.trace(f"[MessageProcessor] Job {job_log} can start immediately")
|
||||
else:
|
||||
# 需要排队
|
||||
await self._send_action_state_response(
|
||||
device_id, action_name, task_id, job_id, "query_action_status", False, 10
|
||||
)
|
||||
logger.info(f"[MessageProcessor] Job {job_log} queued")
|
||||
logger.trace(f"[MessageProcessor] Job {job_log} queued")
|
||||
|
||||
# 通知QueueProcessor有新的队列更新
|
||||
if self.queue_processor:
|
||||
@@ -718,6 +754,32 @@ class MessageProcessor:
|
||||
req = JobAddReq(**data)
|
||||
|
||||
job_log = format_job_log(req.job_id, req.task_id, req.device_id, req.action)
|
||||
|
||||
# 服务端对always_free动作可能跳过query_action_state直接发job_start,
|
||||
# 此时job尚未注册,需要自动补注册
|
||||
existing_job = self.device_manager.get_job_info(req.job_id)
|
||||
if not existing_job:
|
||||
action_name = req.action
|
||||
device_action_key = f"/devices/{req.device_id}/{action_name}"
|
||||
action_always_free = self._check_action_always_free(req.device_id, action_name)
|
||||
|
||||
if action_always_free:
|
||||
job_info = JobInfo(
|
||||
job_id=req.job_id,
|
||||
task_id=req.task_id,
|
||||
device_id=req.device_id,
|
||||
action_name=action_name,
|
||||
device_action_key=device_action_key,
|
||||
status=JobStatus.QUEUE,
|
||||
start_time=time.time(),
|
||||
always_free=True,
|
||||
)
|
||||
self.device_manager.add_queue_request(job_info)
|
||||
logger.info(f"[MessageProcessor] Job {job_log} always_free, auto-registered from direct job_start")
|
||||
else:
|
||||
logger.error(f"[MessageProcessor] Job {job_log} not registered (missing query_action_state)")
|
||||
return
|
||||
|
||||
success = self.device_manager.start_job(req.job_id)
|
||||
if not success:
|
||||
logger.error(f"[MessageProcessor] Failed to start job {job_log}")
|
||||
@@ -911,9 +973,7 @@ class MessageProcessor:
|
||||
device_action_groups[key_add] = []
|
||||
device_action_groups[key_add].append(item["uuid"])
|
||||
|
||||
logger.info(
|
||||
f"[资源同步] 跨站Transfer: {item['uuid'][:8]} from {device_old_id} to {device_id}"
|
||||
)
|
||||
logger.info(f"[资源同步] 跨站Transfer: {item['uuid'][:8]} from {device_old_id} to {device_id}")
|
||||
else:
|
||||
# 正常update
|
||||
key = (device_id, "update")
|
||||
@@ -927,7 +987,9 @@ class MessageProcessor:
|
||||
device_action_groups[key] = []
|
||||
device_action_groups[key].append(item["uuid"])
|
||||
|
||||
logger.trace(f"[资源同步] 动作 {action} 分组数量: {len(device_action_groups)}, 总数量: {len(resource_uuid_list)}")
|
||||
logger.trace(
|
||||
f"[资源同步] 动作 {action} 分组数量: {len(device_action_groups)}, 总数量: {len(resource_uuid_list)}"
|
||||
)
|
||||
|
||||
# 为每个(device_id, action)创建独立的更新线程
|
||||
for (device_id, actual_action), items in device_action_groups.items():
|
||||
@@ -963,45 +1025,77 @@ class MessageProcessor:
|
||||
)
|
||||
thread.start()
|
||||
|
||||
async def _handle_device_manage(self, device_list: list[ResourceDictType], action: str):
    """Dispatch add_device / remove_device requests from the LabGo server.

    Every entry of ``device_list`` is handed to ``HostNode.notify_device_manage``
    on its own daemon thread (presumably so the HostNode call cannot stall the
    event loop -- confirm with the caller). An empty list is a no-op.

    Args:
        device_list: Resource dictionaries to add or remove. Each may carry a
            ``target_node_id`` (defaults to ``"host_node"``) and an ``id``
            that is only used to name the worker thread.
        action: The operation forwarded to the host node; the dispatcher in
            this file passes ``"add"`` or ``"remove"``.
    """
    if not device_list:
        return

    def _notify(target_id: str, act: str, cfg: ResourceDictType):
        # Runs in the worker thread: look up the host node and forward the request.
        try:
            node = HostNode.get_instance(timeout=5)
            if not node:
                logger.error(f"[DeviceManage] HostNode not available for {act}_device")
                return
            if host_ok := node.notify_device_manage(target_id, act, cfg):
                logger.info(f"[DeviceManage] {act}_device completed on {target_id}")
            else:
                logger.warning(f"[DeviceManage] {act}_device failed on {target_id}")
        except Exception as e:
            logger.error(f"[DeviceManage] Error in {act}_device: {e}")
            logger.error(traceback.format_exc())

    for entry in device_list:
        node_id = entry.get("target_node_id", "host_node")
        worker = threading.Thread(
            target=_notify,
            args=(node_id, action, entry),
            daemon=True,
            name=f"DeviceManage-{action}-{entry.get('id', '')}",
        )
        worker.start()
|
||||
|
||||
async def _handle_request_restart(self, data: Dict[str, Any]):
|
||||
"""
|
||||
处理重启请求
|
||||
|
||||
|
||||
当LabGo发送request_restart时,执行清理并触发重启
|
||||
"""
|
||||
reason = data.get("reason", "unknown")
|
||||
delay = data.get("delay", 2) # 默认延迟2秒
|
||||
logger.info(f"[MessageProcessor] Received restart request, reason: {reason}, delay: {delay}s")
|
||||
|
||||
|
||||
# 发送确认消息
|
||||
if self.websocket_client:
|
||||
await self.websocket_client.send_message({
|
||||
"action": "restart_acknowledged",
|
||||
"data": {"reason": reason, "delay": delay}
|
||||
})
|
||||
|
||||
self.send_message(
|
||||
{"action": "restart_acknowledged", "data": {"reason": reason, "delay": delay}}
|
||||
)
|
||||
|
||||
# 设置全局重启标志
|
||||
import unilabos.app.main as main_module
|
||||
|
||||
main_module._restart_requested = True
|
||||
main_module._restart_reason = reason
|
||||
|
||||
|
||||
# 延迟后执行清理
|
||||
await asyncio.sleep(delay)
|
||||
|
||||
|
||||
# 在新线程中执行清理,避免阻塞当前事件循环
|
||||
def do_cleanup():
|
||||
import time
|
||||
|
||||
time.sleep(0.5) # 给当前消息处理完成的时间
|
||||
logger.info(f"[MessageProcessor] Starting cleanup for restart, reason: {reason}")
|
||||
try:
|
||||
from unilabos.app.utils import cleanup_for_restart
|
||||
|
||||
if cleanup_for_restart():
|
||||
logger.info("[MessageProcessor] Cleanup successful, main() will restart")
|
||||
else:
|
||||
logger.error("[MessageProcessor] Cleanup failed")
|
||||
except Exception as e:
|
||||
logger.error(f"[MessageProcessor] Error during cleanup: {e}")
|
||||
|
||||
|
||||
cleanup_thread = threading.Thread(target=do_cleanup, name="RestartCleanupThread", daemon=True)
|
||||
cleanup_thread.start()
|
||||
logger.info(f"[MessageProcessor] Restart cleanup scheduled")
|
||||
@@ -1077,6 +1171,7 @@ class QueueProcessor:
|
||||
def stop(self) -> None:
    """Stop the queue-processing thread.

    Clears the running flag, sets the queue-update event so a thread waiting
    on it wakes immediately, and waits up to 2 seconds for the thread to exit.
    """
    self.is_running = False
    # Wake the worker right away instead of letting it sit in its wait.
    self.queue_update_event.set()
    worker = self.thread
    if worker and worker.is_alive():
        worker.join(timeout=2)
    logger.info("[QueueProcessor] Stopped")
|
||||
@@ -1197,7 +1292,7 @@ class QueueProcessor:
|
||||
success = self.message_processor.send_message(message)
|
||||
job_log = format_job_log(job_info.job_id, job_info.task_id, job_info.device_id, job_info.action_name)
|
||||
if success:
|
||||
logger.debug(f"[QueueProcessor] Sent busy/need_more for queued job {job_log}")
|
||||
logger.trace(f"[QueueProcessor] Sent busy/need_more for queued job {job_log}")
|
||||
else:
|
||||
logger.warning(f"[QueueProcessor] Failed to send busy status for job {job_log}")
|
||||
|
||||
@@ -1220,7 +1315,7 @@ class QueueProcessor:
|
||||
job_info.action_name,
|
||||
)
|
||||
|
||||
logger.info(f"[QueueProcessor] Job {job_log} completed with status: {status}")
|
||||
logger.trace(f"[QueueProcessor] Job {job_log} completed with status: {status}")
|
||||
|
||||
# 结束任务,获取下一个可执行的任务
|
||||
next_job = self.device_manager.end_job(job_id)
|
||||
@@ -1240,8 +1335,8 @@ class QueueProcessor:
|
||||
},
|
||||
}
|
||||
self.message_processor.send_message(message)
|
||||
next_job_log = format_job_log(next_job.job_id, next_job.task_id, next_job.device_id, next_job.action_name)
|
||||
logger.info(f"[QueueProcessor] Notified next job {next_job_log} can start")
|
||||
# next_job_log = format_job_log(next_job.job_id, next_job.task_id, next_job.device_id, next_job.action_name)
|
||||
# logger.debug(f"[QueueProcessor] Notified next job {next_job_log} can start")
|
||||
|
||||
# 立即触发下一轮状态检查
|
||||
self.notify_queue_update()
|
||||
@@ -1330,8 +1425,8 @@ class WebSocketClient(BaseCommunicationClient):
|
||||
message = {"action": "normal_exit", "data": {"session_id": session_id}}
|
||||
self.message_processor.send_message(message)
|
||||
logger.info(f"[WebSocketClient] Sent normal_exit message with session_id: {session_id}")
|
||||
# 给一点时间让消息发送出去
|
||||
time.sleep(1)
|
||||
# send_handler 每100ms检查一次队列,等300ms足以让消息发出
|
||||
time.sleep(0.3)
|
||||
except Exception as e:
|
||||
logger.warning(f"[WebSocketClient] Failed to send normal_exit message: {str(e)}")
|
||||
|
||||
@@ -1383,7 +1478,7 @@ class WebSocketClient(BaseCommunicationClient):
|
||||
except (KeyError, AttributeError):
|
||||
logger.warning(f"[WebSocketClient] Failed to remove job {item.job_id} from HostNode status")
|
||||
|
||||
logger.info(f"[WebSocketClient] Intercepting final status for job_id: {item.job_id} - {status}")
|
||||
# logger.debug(f"[WebSocketClient] Intercepting final status for job_id: {item.job_id} - {status}")
|
||||
|
||||
# 通知队列处理器job完成(包括timeout的job)
|
||||
self.queue_processor.handle_job_completed(item.job_id, status)
|
||||
@@ -1444,15 +1539,17 @@ class WebSocketClient(BaseCommunicationClient):
|
||||
# 收集设备信息
|
||||
devices = []
|
||||
machine_name = BasicConfig.machine_name
|
||||
|
||||
|
||||
try:
|
||||
host_node = HostNode.get_instance(0)
|
||||
if host_node:
|
||||
# 获取设备信息
|
||||
for device_id, namespace in host_node.devices_names.items():
|
||||
device_key = f"{namespace}/{device_id}" if namespace.startswith("/") else f"/{namespace}/{device_id}"
|
||||
device_key = (
|
||||
f"{namespace}/{device_id}" if namespace.startswith("/") else f"/{namespace}/{device_id}"
|
||||
)
|
||||
is_online = device_key in host_node._online_devices
|
||||
|
||||
|
||||
# 获取设备的动作信息
|
||||
actions = {}
|
||||
for action_id, client in host_node._action_clients.items():
|
||||
@@ -1463,16 +1560,18 @@ class WebSocketClient(BaseCommunicationClient):
|
||||
"action_path": action_id,
|
||||
"action_type": str(type(client).__name__),
|
||||
}
|
||||
|
||||
devices.append({
|
||||
"device_id": device_id,
|
||||
"namespace": namespace,
|
||||
"device_key": device_key,
|
||||
"is_online": is_online,
|
||||
"machine_name": host_node.device_machine_names.get(device_id, machine_name),
|
||||
"actions": actions,
|
||||
})
|
||||
|
||||
|
||||
devices.append(
|
||||
{
|
||||
"device_id": device_id,
|
||||
"namespace": namespace,
|
||||
"device_key": device_key,
|
||||
"is_online": is_online,
|
||||
"machine_name": host_node.device_machine_names.get(device_id, machine_name),
|
||||
"actions": actions,
|
||||
}
|
||||
)
|
||||
|
||||
logger.info(f"[WebSocketClient] Collected {len(devices)} devices for host_ready")
|
||||
except Exception as e:
|
||||
logger.warning(f"[WebSocketClient] Error collecting device info: {e}")
|
||||
|
||||
Reference in New Issue
Block a user