fast registry load

minor fix on skill & registry

stripe ros2 schema desc
add create-device-skill

new registry system, backward compatible with YAML

remove non-existent resources

new registry sys
experimental support for add device

add ai conventions

correct raise create resource error

ret info fix revert

ret info fix

fix prcxi check

add create_resource schema

re-signal host ready event

add websocket connection timeout and improve reconnection logic

add open_timeout parameter to websocket connection
add TimeoutError and InvalidStatus exception handling
implement exponential backoff for reconnection attempts
simplify reconnection logic flow

add gzip

change pose extra to any

add isFlapY
This commit is contained in:
Xuwznln
2026-03-04 18:59:45 +08:00
parent 145fcaae65
commit c001f6a151
99 changed files with 10885 additions and 7191 deletions

View File

@@ -4,6 +4,7 @@ import os
import platform
import shutil
import signal
import subprocess
import sys
import threading
import time
@@ -25,6 +26,84 @@ from unilabos.config.config import load_config, BasicConfig, HTTPConfig
_restart_requested: bool = False
_restart_reason: str = ""
RESTART_EXIT_CODE = 42
def _build_child_argv(argv=None):
    """Build the argument list for the child process.

    Copies ``argv`` while dropping the options that only the supervisor
    should see: the ``--restart_mode`` flag and the ``--auto_restart_count``
    option together with its value. Both the underscore and the dash
    spelling are recognised, and the count is removed in the
    ``--opt value`` form as well as the ``--opt=value`` form.

    Args:
        argv: Command line to filter (list of strings). Defaults to
            ``sys.argv`` when omitted or ``None``.

    Returns:
        A new list with the supervisor-only arguments removed; the input
        list is not modified.
    """
    if argv is None:
        argv = sys.argv

    flag_names = ("--restart_mode", "--restart-mode")
    count_names = ("--auto_restart_count", "--auto-restart-count")
    count_prefixes = tuple(f"{name}=" for name in count_names)

    result = []
    skip_next = False
    for arg in argv:
        if skip_next:
            # This is the value that followed "--auto_restart_count".
            skip_next = False
            continue
        if arg in flag_names:
            # The flag must not reach the child, otherwise main() would
            # start another supervisor instead of the real process.
            continue
        if arg in count_names:
            skip_next = True
            continue
        if arg.startswith(count_prefixes):
            continue
        result.append(arg)
    return result
def _run_as_supervisor(max_restarts: int):
    """Run as a supervisor that spawns and monitors the real process.

    Similar to Uvicorn's --reload: the supervisor itself does no heavy work,
    it only launches the real process as a child (same interpreter, same
    command line minus the supervisor-only arguments) and relaunches it
    whenever the child exits with RESTART_EXIT_CODE.

    This function never returns normally; it always ends via ``sys.exit``:

    * with the child's exit code when the child exits with anything other
      than RESTART_EXIT_CODE,
    * with 1 when more than ``max_restarts`` restarts have been requested,
    * with 1 when the supervisor is interrupted by Ctrl+C.

    Args:
        max_restarts: Maximum number of restarts the supervisor will perform.
    """
    child_argv = [sys.executable] + _build_child_argv()
    restart_count = 0

    print_status(
        f"[Supervisor] Restart mode enabled (max restarts: {max_restarts}), "
        f"child command: {' '.join(child_argv)}",
        "info",
    )

    while True:
        print_status(
            f"[Supervisor] Launching process (restart {restart_count}/{max_restarts})...",
            "info",
        )

        # Reset per iteration so the interrupt handler never sees an unbound
        # name (interrupt during Popen) or the previous, already-finished child.
        process = None
        try:
            process = subprocess.Popen(child_argv)
            exit_code = process.wait()
        except KeyboardInterrupt:
            print_status("[Supervisor] Interrupted, terminating child process...", "info")
            if process is not None:
                process.terminate()
                try:
                    process.wait(timeout=10)
                except subprocess.TimeoutExpired:
                    # The child ignored the terminate request; force it down.
                    process.kill()
                    process.wait()
            sys.exit(1)

        if exit_code == RESTART_EXIT_CODE:
            restart_count += 1
            if restart_count > max_restarts:
                print_status(
                    f"[Supervisor] Maximum restart count ({max_restarts}) reached, exiting",
                    "warning",
                )
                sys.exit(1)
            print_status(
                f"[Supervisor] Child requested restart ({restart_count}/{max_restarts}), restarting in 2s...",
                "info",
            )
            time.sleep(2)
        else:
            if exit_code != 0:
                print_status(f"[Supervisor] Child exited with code {exit_code}", "warning")
            else:
                print_status("[Supervisor] Child exited normally", "info")
            sys.exit(exit_code)
def load_config_from_file(config_path):
if config_path is None:
@@ -66,6 +145,13 @@ def parse_args():
action="append",
help="Path to the registry directory",
)
parser.add_argument(
"--devices",
type=str,
default=None,
action="append",
help="Path to Python code directory for AST-based device/resource scanning",
)
parser.add_argument(
"--working_dir",
type=str,
@@ -155,18 +241,18 @@ def parse_args():
action="store_true",
help="Skip environment dependency check on startup",
)
parser.add_argument(
"--complete_registry",
action="store_true",
default=False,
help="Complete registry information",
)
parser.add_argument(
"--check_mode",
action="store_true",
default=False,
help="Run in check mode for CI: validates registry imports and ensures no file changes",
)
parser.add_argument(
"--complete_registry",
action="store_true",
default=False,
help="Complete and rewrite YAML registry files using AST analysis results",
)
parser.add_argument(
"--no_update_feedback",
action="store_true",
@@ -178,6 +264,24 @@ def parse_args():
default=False,
help="Test mode: all actions simulate execution and return mock results without running real hardware",
)
parser.add_argument(
"--extra_resource",
action="store_true",
default=False,
help="Load extra lab_ prefixed labware resources (529 auto-generated definitions from lab_resources.py)",
)
parser.add_argument(
"--restart_mode",
action="store_true",
default=False,
help="Enable supervisor mode: automatically restart the process when triggered via WebSocket",
)
parser.add_argument(
"--auto_restart_count",
type=int,
default=500,
help="Maximum number of automatic restarts in restart mode (default: 500)",
)
# workflow upload subcommand
workflow_parser = subparsers.add_parser(
"workflow_upload",
@@ -228,6 +332,11 @@ def main():
args = parser.parse_args()
args_dict = vars(args)
# Supervisor mode: spawn child processes and monitor for restart
if args_dict.get("restart_mode", False):
_run_as_supervisor(args_dict.get("auto_restart_count", 5))
return
# 环境检查 - 检查并自动安装必需的包 (可选)
skip_env_check = args_dict.get("skip_env_check", False)
check_mode = args_dict.get("check_mode", False)
@@ -358,6 +467,9 @@ def main():
BasicConfig.test_mode = args_dict.get("test_mode", False)
if BasicConfig.test_mode:
print_status("启用测试模式:所有动作将模拟执行,不调用真实硬件", "warning")
BasicConfig.extra_resource = args_dict.get("extra_resource", False)
if BasicConfig.extra_resource:
print_status("启用额外资源加载将加载lab_开头的labware资源定义", "info")
BasicConfig.communication_protocol = "websocket"
machine_name = platform.node()
machine_name = "".join([c if c.isalnum() or c == "_" else "_" for c in machine_name])
@@ -382,22 +494,32 @@ def main():
# 显示启动横幅
print_unilab_banner(args_dict)
# 注册表 - check_mode 时强制启用 complete_registry
# Step 0: AST 分析优先 + YAML 注册表加载
# check_mode 和 upload_registry 都会执行实际 import 验证
devices_dirs = args_dict.get("devices", None)
complete_registry = args_dict.get("complete_registry", False) or check_mode
lab_registry = build_registry(args_dict["registry_path"], complete_registry, BasicConfig.upload_registry)
lab_registry = build_registry(
registry_paths=args_dict["registry_path"],
devices_dirs=devices_dirs,
upload_registry=BasicConfig.upload_registry,
check_mode=check_mode,
complete_registry=complete_registry,
)
# Check mode: complete_registry 完成后直接退出git diff 检测由 CI workflow 执行
# Check mode: 注册表验证完成后直接退出
if check_mode:
print_status("Check mode: complete_registry 完成,退出", "info")
device_count = len(lab_registry.device_type_registry)
resource_count = len(lab_registry.resource_type_registry)
print_status(f"Check mode: 注册表验证完成 ({device_count} 设备, {resource_count} 资源),退出", "info")
os._exit(0)
# Step 1: 上传全部注册表到服务端,同步保存到 unilabos_data
if BasicConfig.upload_registry:
# 设备注册到服务端 - 需要 ak 和 sk
if BasicConfig.ak and BasicConfig.sk:
print_status("开始注册设备到服务端...", "info")
# print_status("开始注册设备到服务端...", "info")
try:
register_devices_and_resources(lab_registry)
print_status("设备注册完成", "info")
# print_status("设备注册完成", "info")
except Exception as e:
print_status(f"设备注册失败: {e}", "error")
else:
@@ -482,7 +604,7 @@ def main():
continue
# 如果从远端获取了物料信息,则与本地物料进行同步
if request_startup_json and "nodes" in request_startup_json:
if file_path is not None and request_startup_json and "nodes" in request_startup_json:
print_status("开始同步远端物料到本地...", "info")
remote_tree_set = ResourceTreeSet.from_raw_dict_list(request_startup_json["nodes"])
resource_tree_set.merge_remote_resources(remote_tree_set)
@@ -579,6 +701,10 @@ def main():
open_browser=not args_dict["disable_browser"],
port=BasicConfig.port,
)
if restart_requested:
print_status("[Main] Restart requested, cleaning up...", "info")
cleanup_for_restart()
os._exit(RESTART_EXIT_CODE)
if __name__ == "__main__":

View File

@@ -1,9 +1,8 @@
import json
import time
from typing import Optional, Tuple, Dict, Any
from typing import Any, Dict, Optional, Tuple
from unilabos.utils.log import logger
from unilabos.utils.type_check import TypeEncoder
from unilabos.utils.tools import normalize_json as _normalize_device
def register_devices_and_resources(lab_registry, gather_only=False) -> Optional[Tuple[Dict[str, Any], Dict[str, Any]]]:
@@ -11,50 +10,63 @@ def register_devices_and_resources(lab_registry, gather_only=False) -> Optional[
注册设备和资源到服务器仅支持HTTP
"""
# 注册资源信息 - 使用HTTP方式
from unilabos.app.web.client import http_client
logger.info("[UniLab Register] 开始注册设备和资源...")
# 注册设备信息
devices_to_register = {}
for device_info in lab_registry.obtain_registry_device_info():
devices_to_register[device_info["id"]] = json.loads(
json.dumps(device_info, ensure_ascii=False, cls=TypeEncoder)
)
logger.debug(f"[UniLab Register] 收集设备: {device_info['id']}")
devices_to_register[device_info["id"]] = _normalize_device(device_info)
logger.trace(f"[UniLab Register] 收集设备: {device_info['id']}")
resources_to_register = {}
for resource_info in lab_registry.obtain_registry_resource_info():
resources_to_register[resource_info["id"]] = resource_info
logger.debug(f"[UniLab Register] 收集资源: {resource_info['id']}")
logger.trace(f"[UniLab Register] 收集资源: {resource_info['id']}")
if gather_only:
return devices_to_register, resources_to_register
# 注册设备
if devices_to_register:
try:
start_time = time.time()
response = http_client.resource_registry({"resources": list(devices_to_register.values())})
response = http_client.resource_registry(
{"resources": list(devices_to_register.values())},
tag="device_registry",
)
cost_time = time.time() - start_time
if response.status_code in [200, 201]:
logger.info(f"[UniLab Register] 成功注册 {len(devices_to_register)} 个设备 {cost_time}s")
res_data = response.json() if response.status_code == 200 else {}
skipped = res_data.get("data", {}).get("skipped", False)
if skipped:
logger.info(
f"[UniLab Register] 设备注册跳过(内容未变化)"
f" {len(devices_to_register)}{cost_time:.3f}s"
)
elif response.status_code in [200, 201]:
logger.info(f"[UniLab Register] 成功注册 {len(devices_to_register)} 个设备 {cost_time:.3f}s")
else:
logger.error(f"[UniLab Register] 设备注册失败: {response.status_code}, {response.text} {cost_time}s")
logger.error(f"[UniLab Register] 设备注册失败: {response.status_code}, {response.text} {cost_time:.3f}s")
except Exception as e:
logger.error(f"[UniLab Register] 设备注册异常: {e}")
# 注册资源
if resources_to_register:
try:
start_time = time.time()
response = http_client.resource_registry({"resources": list(resources_to_register.values())})
response = http_client.resource_registry(
{"resources": list(resources_to_register.values())},
tag="resource_registry",
)
cost_time = time.time() - start_time
if response.status_code in [200, 201]:
logger.info(f"[UniLab Register] 成功注册 {len(resources_to_register)} 个资源 {cost_time}s")
res_data = response.json() if response.status_code == 200 else {}
skipped = res_data.get("data", {}).get("skipped", False)
if skipped:
logger.info(
f"[UniLab Register] 资源注册跳过(内容未变化)"
f" {len(resources_to_register)}{cost_time:.3f}s"
)
elif response.status_code in [200, 201]:
logger.info(f"[UniLab Register] 成功注册 {len(resources_to_register)} 个资源 {cost_time:.3f}s")
else:
logger.error(f"[UniLab Register] 资源注册失败: {response.status_code}, {response.text} {cost_time}s")
logger.error(f"[UniLab Register] 资源注册失败: {response.status_code}, {response.text} {cost_time:.3f}s")
except Exception as e:
logger.error(f"[UniLab Register] 资源注册异常: {e}")
logger.info("[UniLab Register] 设备和资源注册完成.")

View File

@@ -1052,7 +1052,7 @@ async def handle_file_import(websocket: WebSocket, request_data: dict):
"result": {},
"schema": lab_registry._generate_unilab_json_command_schema(v["args"], k),
"goal_default": {i["name"]: i["default"] for i in v["args"]},
"handles": [],
"handles": {},
}
# 不生成已配置action的动作
for k, v in enhanced_info["action_methods"].items()
@@ -1340,5 +1340,5 @@ def setup_api_routes(app):
# 启动广播任务
@app.on_event("startup")
async def startup_event():
asyncio.create_task(broadcast_device_status())
asyncio.create_task(broadcast_status_page_data())
asyncio.create_task(broadcast_device_status(), name="web-api-startup-device")
asyncio.create_task(broadcast_status_page_data(), name="web-api-startup-status")

View File

@@ -3,11 +3,13 @@ HTTP客户端模块
提供与远程服务器通信的客户端功能只有host需要用
"""
import gzip
import json
import os
from typing import List, Dict, Any, Optional
from unilabos.utils.tools import fast_dumps as _fast_dumps, fast_dumps_pretty as _fast_dumps_pretty
import requests
from unilabos.resources.resource_tracker import ResourceTreeSet
from unilabos.utils.log import info
@@ -280,22 +282,54 @@ class HTTPClient:
)
return response
def resource_registry(self, registry_data: Dict[str, Any] | List[Dict[str, Any]]) -> requests.Response:
def resource_registry(
self, registry_data: Dict[str, Any] | List[Dict[str, Any]], tag: str = "registry",
) -> requests.Response:
"""
注册资源到服务器
注册资源到服务器,同步保存请求/响应到 unilabos_data
Args:
registry_data: 注册表数据,格式为 {resource_id: resource_info} / [{resource_info}]
tag: 保存文件的标签后缀 (如 "device_registry" / "resource_registry")
Returns:
Response: API响应对象
"""
# 序列化一次,同时用于保存和发送
json_bytes = _fast_dumps(registry_data)
# 保存请求数据到 unilabos_data
req_path = os.path.join(BasicConfig.working_dir, f"req_{tag}_upload.json")
try:
os.makedirs(BasicConfig.working_dir, exist_ok=True)
with open(req_path, "wb") as f:
f.write(_fast_dumps_pretty(registry_data))
logger.trace(f"注册表请求数据已保存: {req_path}")
except Exception as e:
logger.warning(f"保存注册表请求数据失败: {e}")
compressed_body = gzip.compress(json_bytes)
headers = {
"Authorization": f"Lab {self.auth}",
"Content-Type": "application/json",
"Content-Encoding": "gzip",
}
response = requests.post(
f"{self.remote_addr}/lab/resource",
json=registry_data,
headers={"Authorization": f"Lab {self.auth}"},
data=compressed_body,
headers=headers,
timeout=30,
)
# 保存响应数据到 unilabos_data
res_path = os.path.join(BasicConfig.working_dir, f"res_{tag}_upload.json")
try:
with open(res_path, "w", encoding="utf-8") as f:
f.write(f"{response.status_code}\n{response.text}")
logger.trace(f"注册表响应数据已保存: {res_path}")
except Exception as e:
logger.warning(f"保存注册表响应数据失败: {e}")
if response.status_code not in [200, 201]:
logger.error(f"注册资源失败: {response.status_code}, {response.text}")
if response.status_code == 200:

View File

@@ -86,7 +86,7 @@ def setup_server() -> FastAPI:
# 设置页面路由
try:
setup_web_pages(pages)
info("[Web] 已加载Web UI模块")
# info("[Web] 已加载Web UI模块")
except ImportError as e:
info(f"[Web] 未找到Web页面模块: {str(e)}")
except Exception as e:
@@ -138,7 +138,7 @@ def start_server(host: str = "0.0.0.0", port: int = 8002, open_browser: bool = T
server_thread = threading.Thread(target=server.run, daemon=True, name="uvicorn_server")
server_thread.start()
info("[Web] Server started, monitoring for restart requests...")
# info("[Web] Server started, monitoring for restart requests...")
# 监控重启标志
import unilabos.app.main as main_module

View File

@@ -26,6 +26,7 @@ from enum import Enum
from typing_extensions import TypedDict
from unilabos.app.model import JobAddReq
from unilabos.resources.resource_tracker import ResourceDictType
from unilabos.ros.nodes.presets.host_node import HostNode
from unilabos.utils.type_check import serialize_result_info
from unilabos.app.communication import BaseCommunicationClient
@@ -408,6 +409,7 @@ class MessageProcessor:
# 线程控制
self.is_running = False
self.thread = None
self._loop = None # asyncio event loop引用用于外部关闭websocket
self.reconnect_count = 0
logger.info(f"[MessageProcessor] Initialized for URL: {websocket_url}")
@@ -434,22 +436,31 @@ class MessageProcessor:
def stop(self) -> None:
    """Stop the message-processing thread.

    Clears ``is_running``, schedules a close of the current websocket on the
    processor's event loop (when both exist and the loop is running) so the
    receive loop is interrupted quickly, then waits up to 2 seconds for the
    worker thread to finish.
    """
    self.is_running = False

    # Work on local references: this method reads the attributes once and
    # then uses the same objects for the check and for the call.
    ws = self.websocket
    loop = self._loop
    if ws and loop and loop.is_running():
        try:
            asyncio.run_coroutine_threadsafe(ws.close(), loop)
        except Exception as e:
            # Best effort only: a failure to schedule the close must not
            # prevent shutdown, but leave a trace for debugging.
            logger.debug(f"[MessageProcessor] Failed to schedule websocket close: {e}")

    if self.thread and self.thread.is_alive():
        self.thread.join(timeout=2)
    logger.info("[MessageProcessor] Stopped")
def _run(self):
"""运行消息处理主循环"""
loop = asyncio.new_event_loop()
self._loop = asyncio.new_event_loop()
try:
asyncio.set_event_loop(loop)
loop.run_until_complete(self._connection_handler())
asyncio.set_event_loop(self._loop)
self._loop.run_until_complete(self._connection_handler())
except Exception as e:
logger.error(f"[MessageProcessor] Thread error: {str(e)}")
logger.error(traceback.format_exc())
finally:
if loop:
loop.close()
if self._loop:
self._loop.close()
self._loop = None
async def _connection_handler(self):
"""处理WebSocket连接和重连逻辑"""
@@ -466,8 +477,10 @@ class MessageProcessor:
async with websockets.connect(
self.websocket_url,
ssl=ssl_context,
open_timeout=20,
ping_interval=WSConfig.ping_interval,
ping_timeout=10,
close_timeout=5,
additional_headers={
"Authorization": f"Lab {BasicConfig.auth_secret()}",
"EdgeSession": f"{self.session_id}",
@@ -478,85 +491,98 @@ class MessageProcessor:
self.connected = True
self.reconnect_count = 0
logger.info(f"[MessageProcessor] Connected to {self.websocket_url}")
logger.info(f"[MessageProcessor] 已连接到 {self.websocket_url}")
# 启动发送协程
send_task = asyncio.create_task(self._send_handler())
send_task = asyncio.create_task(self._send_handler(), name="websocket-send_task")
# 每次连接(含重连)后重新向服务端注册,
# 否则服务端不知道客户端已上线,不会推送消息。
if self.websocket_client:
self.websocket_client.publish_host_ready()
try:
# 接收消息循环
await self._message_handler()
finally:
# 必须在 async with __aexit__ 之前停止 send_task
# 否则 send_task 会在关闭握手期间继续发送数据,
# 干扰 websockets 库的内部清理,导致 task 泄漏。
self.connected = False
send_task.cancel()
try:
await send_task
except asyncio.CancelledError:
pass
self.connected = False
except websockets.exceptions.ConnectionClosed:
logger.warning("[MessageProcessor] Connection closed")
self.connected = False
logger.warning("[MessageProcessor] 与服务端连接中断")
except TimeoutError:
logger.warning(
f"[MessageProcessor] 与服务端连接通信超时 (已尝试 {self.reconnect_count + 1} 次),请检查您的网络状况"
)
except websockets.exceptions.InvalidStatus as e:
logger.warning(
f"[MessageProcessor] 收到服务端注册码 {e.response.status_code}, 上一进程可能还未退出"
)
except Exception as e:
logger.error(f"[MessageProcessor] Connection error: {str(e)}")
logger.error(traceback.format_exc())
self.connected = False
logger.error(f"[MessageProcessor] 尝试重连时出错 {str(e)}")
finally:
self.connected = False
self.websocket = None
# 重连逻辑
if self.is_running and self.reconnect_count < WSConfig.max_reconnect_attempts:
if not self.is_running:
break
if self.reconnect_count < WSConfig.max_reconnect_attempts:
self.reconnect_count += 1
backoff = WSConfig.reconnect_interval
logger.info(
f"[MessageProcessor] Reconnecting in {WSConfig.reconnect_interval}s "
f"(attempt {self.reconnect_count}/{WSConfig.max_reconnect_attempts})"
f"[MessageProcessor] 即将在 {backoff} 秒后重连 (已尝试 {self.reconnect_count}/{WSConfig.max_reconnect_attempts})"
)
await asyncio.sleep(WSConfig.reconnect_interval)
elif self.reconnect_count >= WSConfig.max_reconnect_attempts:
await asyncio.sleep(backoff)
else:
logger.error("[MessageProcessor] Max reconnection attempts reached")
break
else:
self.reconnect_count -= 1
async def _message_handler(self):
"""处理接收到的消息"""
"""处理接收到的消息
ConnectionClosed 不在此处捕获,让其向上传播到 _connection_handler
以便 async with websockets.connect() 的 __aexit__ 能感知连接已断,
正确清理内部 task避免 task 泄漏。
"""
if not self.websocket:
logger.error("[MessageProcessor] WebSocket connection is None")
return
try:
async for message in self.websocket:
try:
data = json.loads(message)
message_type = data.get("action", "")
message_data = data.get("data")
if self.session_id and self.session_id == data.get("edge_session"):
await self._process_message(message_type, message_data)
async for message in self.websocket:
try:
data = json.loads(message)
message_type = data.get("action", "")
message_data = data.get("data")
if self.session_id and self.session_id == data.get("edge_session"):
await self._process_message(message_type, message_data)
else:
if message_type.endswith("_material"):
logger.trace(
f"[MessageProcessor] 收到一条归属 {data.get('edge_session')} 的旧消息:{data}"
)
logger.debug(
f"[MessageProcessor] 跳过了一条归属 {data.get('edge_session')} 的旧消息: {data.get('action')}"
)
else:
if message_type.endswith("_material"):
logger.trace(
f"[MessageProcessor] 收到一条归属 {data.get('edge_session')} 的旧消息:{data}"
)
logger.debug(
f"[MessageProcessor] 跳过了一条归属 {data.get('edge_session')} 的旧消息: {data.get('action')}"
)
else:
await self._process_message(message_type, message_data)
except json.JSONDecodeError:
logger.error(f"[MessageProcessor] Invalid JSON received: {message}")
except Exception as e:
logger.error(f"[MessageProcessor] Error processing message: {str(e)}")
logger.error(traceback.format_exc())
except websockets.exceptions.ConnectionClosed:
logger.info("[MessageProcessor] Message handler stopped - connection closed")
except Exception as e:
logger.error(f"[MessageProcessor] Message handler error: {str(e)}")
logger.error(traceback.format_exc())
await self._process_message(message_type, message_data)
except json.JSONDecodeError:
logger.error(f"[MessageProcessor] Invalid JSON received: {message}")
except Exception as e:
logger.error(f"[MessageProcessor] Error processing message: {str(e)}")
logger.error(traceback.format_exc())
async def _send_handler(self):
"""处理发送队列中的消息"""
logger.debug("[MessageProcessor] Send handler started")
logger.trace("[MessageProcessor] Send handler started")
try:
while self.connected and self.websocket:
@@ -601,6 +627,7 @@ class MessageProcessor:
except asyncio.CancelledError:
logger.debug("[MessageProcessor] Send handler cancelled")
raise
except Exception as e:
logger.error(f"[MessageProcessor] Fatal error in send handler: {str(e)}")
logger.error(traceback.format_exc())
@@ -632,6 +659,10 @@ class MessageProcessor:
# elif message_type == "session_id":
# self.session_id = message_data.get("session_id")
# logger.info(f"[MessageProcessor] Session ID: {self.session_id}")
elif message_type == "add_device":
await self._handle_device_manage(message_data, "add")
elif message_type == "remove_device":
await self._handle_device_manage(message_data, "remove")
elif message_type == "request_restart":
await self._handle_request_restart(message_data)
else:
@@ -968,6 +999,37 @@ class MessageProcessor:
)
thread.start()
async def _handle_device_manage(self, device_list: list[ResourceDictType], action: str):
    """Handle add_device / remove_device from LabGo server.

    For every entry in ``device_list`` a daemon thread is started that asks
    the HostNode to apply ``action`` for that entry on the entry's
    ``target_node_id`` (``"host_node"`` when the key is absent). An empty or
    missing list is ignored.

    Args:
        device_list: Device entries received with the message.
        action: The action string passed by the caller (``"add"`` or
            ``"remove"`` in the visible call sites).
    """

    def _notify(target_id: str, act: str, cfg: ResourceDictType):
        # Runs in its own thread: waits for the HostNode, forwards the
        # request and logs the outcome.
        try:
            host_node = HostNode.get_instance(timeout=5)
            if not host_node:
                logger.error(f"[DeviceManage] HostNode not available for {act}_device")
                return
            if host_node.notify_device_manage(target_id, act, cfg):
                logger.info(f"[DeviceManage] {act}_device completed on {target_id}")
            else:
                logger.warning(f"[DeviceManage] {act}_device failed on {target_id}")
        except Exception as e:
            logger.error(f"[DeviceManage] Error in {act}_device: {e}")
            logger.error(traceback.format_exc())

    if not device_list:
        return

    for item in device_list:
        node_id = item.get("target_node_id", "host_node")
        threading.Thread(
            target=_notify,
            args=(node_id, action, item),
            daemon=True,
            name=f"DeviceManage-{action}-{item.get('id', '')}",
        ).start()
async def _handle_request_restart(self, data: Dict[str, Any]):
"""
处理重启请求
@@ -979,10 +1041,9 @@ class MessageProcessor:
logger.info(f"[MessageProcessor] Received restart request, reason: {reason}, delay: {delay}s")
# 发送确认消息
if self.websocket_client:
await self.websocket_client.send_message(
{"action": "restart_acknowledged", "data": {"reason": reason, "delay": delay}}
)
self.send_message(
{"action": "restart_acknowledged", "data": {"reason": reason, "delay": delay}}
)
# 设置全局重启标志
import unilabos.app.main as main_module
@@ -1084,13 +1145,14 @@ class QueueProcessor:
def stop(self) -> None:
    """Stop the queue-processing thread.

    Clears ``is_running``, sets ``queue_update_event`` so a waiting worker
    wakes up immediately, and waits up to 2 seconds for the thread to end.
    """
    self.is_running = False
    # Wake the worker right away instead of letting it wait on the event.
    self.queue_update_event.set()

    worker = self.thread
    if worker and worker.is_alive():
        worker.join(timeout=2)

    logger.info("[QueueProcessor] Stopped")
def _run(self):
"""运行队列处理主循环"""
logger.debug("[QueueProcessor] Queue processor started")
logger.trace("[QueueProcessor] Queue processor started")
while self.is_running:
try:
@@ -1305,7 +1367,6 @@ class WebSocketClient(BaseCommunicationClient):
else:
url = f"{scheme}://{parsed.netloc}/api/v1/ws/schedule"
logger.debug(f"[WebSocketClient] URL: {url}")
return url
def start(self) -> None:
@@ -1318,13 +1379,11 @@ class WebSocketClient(BaseCommunicationClient):
logger.error("[WebSocketClient] WebSocket URL not configured")
return
logger.info(f"[WebSocketClient] Starting connection to {self.websocket_url}")
# 启动两个核心线程
self.message_processor.start()
self.queue_processor.start()
logger.info("[WebSocketClient] All threads started")
logger.trace("[WebSocketClient] All threads started")
def stop(self) -> None:
"""停止WebSocket客户端"""
@@ -1340,8 +1399,8 @@ class WebSocketClient(BaseCommunicationClient):
message = {"action": "normal_exit", "data": {"session_id": session_id}}
self.message_processor.send_message(message)
logger.info(f"[WebSocketClient] Sent normal_exit message with session_id: {session_id}")
# 给一点时间让消息发送出去
time.sleep(1)
# send_handler 每100ms检查一次队列等300ms足以让消息发
time.sleep(0.3)
except Exception as e:
logger.warning(f"[WebSocketClient] Failed to send normal_exit message: {str(e)}")