feat: first commit

2025-10-08 20:39:09 +08:00
commit 80f0e7f8d7
82 changed files with 12216 additions and 0 deletions
--- a/src/dag_hmm_classifier.py
+++ b/src/dag_hmm_classifier.py
@@ -0,0 +1,745 @@
+"""
+优化版DAG-HMM分类器模块 - 基于米兰大学论文Algorithm 1的改进实现
+
+主要修复：
+1. 添加转移矩阵验证和修复方法
+2. 改进HMM参数设置
+3. 增强错误处理机制
+4. 优化特征处理流程
+5. 修复意图分类分数异常问题：为每个意图训练独立的HMM模型，并使用softmax进行概率归一化。
+"""
+
+import os
+import numpy as np
+import json
+import pickle
+from typing import Dict, Any, List, Optional, Tuple
+from hmmlearn import hmm
+import networkx as nx
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+from itertools import combinations
+import warnings
+warnings.filterwarnings("ignore")
+
+class DAGHMMClassifier:
+    """
+    修复版DAG-HMM分类器 - 解决转移矩阵问题和意图分类分数问题
+
+    主要修复：
+    - HMM转移矩阵零行问题
+    - 参数设置优化
+    - 错误处理增强
+    - 为每个意图训练独立的HMM模型，并使用softmax进行概率归一化
+    """
+
+    def __init__(
+        self,
+        max_states: int = 1,
+        max_gaussians: int = 1,
+        covariance_type: str = "diag",
+        n_iter: int = 100,
+        random_state: int = 42,
+        cv_folds: int = 5,
+    ):
+        """
+        Store the hyper-parameters and create empty model containers.
+
+        Args:
+            max_states: Upper bound on hidden states; must convert to a
+                positive integer.
+            max_gaussians: Upper bound on mixture components per state; must
+                convert to a positive integer.
+            covariance_type: Covariance type forwarded to ``hmm.GMMHMM``.
+            n_iter: Number of training iterations forwarded to ``hmm.GMMHMM``.
+            random_state: Default random seed for the created models.
+            cv_folds: Number of cross-validation folds (stored only; no method
+                in this class reads it).
+
+        Raises:
+            ValueError: If ``max_states`` or ``max_gaussians`` is not a
+                positive integer.
+        """
+        # Validate the two size parameters, max_states first so its error wins.
+        for attr_name, raw_value in (
+            ("max_states", max_states),
+            ("max_gaussians", max_gaussians),
+        ):
+            setattr(self, attr_name, self._validate_positive_integer(raw_value, attr_name))
+
+        # The remaining hyper-parameters are stored unchecked.
+        self.covariance_type, self.n_iter = covariance_type, n_iter
+        self.random_state, self.cv_folds = random_state, cv_folds
+
+        # Model components: one HMM per intent name, filled by fit()/load_model().
+        self.intent_models = {}
+        self.class_names = []
+        self.label_encoder = None
+        self.scaler = StandardScaler()
+
+        print("✅ 修复版DAG-HMM分类器已初始化（对数似然修复版）")
+
+    def _validate_positive_integer(self, value: Any, param_name: str) -> int:
+        """
+        Convert ``value`` to ``int`` and require the result to be > 0.
+
+        Args:
+            value: Anything accepted by ``int()`` (note that floats are
+                truncated by ``int()``).
+            param_name: Parameter name used in the error messages.
+
+        Returns:
+            The converted, strictly positive integer.
+
+        Raises:
+            ValueError: If the conversion fails or the result is <= 0.
+        """
+        try:
+            int_value = int(value)
+        except (ValueError, TypeError, OverflowError) as e:
+            raise ValueError(f"无法将 {param_name} 转换为正整数: {value}, 错误: {e}") from e
+
+        # The range check sits outside the try block so that its message is
+        # not caught and re-wrapped as a conversion failure.
+        if int_value <= 0:
+            raise ValueError(f"{param_name} 必须是正整数，得到: {int_value}")
+        return int_value
+
+    def _fix_transition_matrix(self, model, model_name="HMM"):
+        """
+        Repair rows of ``model.transmat_`` that are not valid distributions.
+
+        Rows whose sum is (almost) zero are replaced by a uniform row, a tiny
+        epsilon is added to the whole matrix and every row is rescaled to sum
+        to 1. Afterwards the row sums are verified and, if they are still off,
+        the matrix is renormalized once more. The matrix is modified in place.
+
+        Args:
+            model: Object exposing ``transmat_`` and ``n_components``.
+            model_name: Label used in the log output.
+
+        Returns:
+            The same ``model`` object. Any exception is logged and swallowed,
+            in which case the model is returned as it was at that point.
+        """
+        def _renormalize_rows(matrix):
+            # Rescale each row to sum to 1; a row whose sum is non-positive,
+            # NaN or infinite becomes uniform (or zeros if there are no columns).
+            width = matrix.shape[1]
+            for r in range(matrix.shape[0]):
+                total = np.sum(matrix[r, :])
+                if total > 0 and not np.isnan(total) and not np.isinf(total):
+                    matrix[r, :] /= total
+                elif width > 0:
+                    matrix[r, :] = 1.0 / width
+                else:
+                    matrix[r, :] = 0.0
+
+        try:
+            trans = model.transmat_
+
+            # Nothing can be repaired for a model without states.
+            if model.n_components == 0:
+                print(f"⚠️ {model_name}: 模型状态数 n_components 为 0，无法修复转移矩阵。")
+                return model
+
+            # Rows whose sum is numerically zero.
+            dead_rows = np.flatnonzero(np.abs(np.sum(trans, axis=1)) < 1e-10)
+
+            if len(dead_rows) > 0:
+                print(f"🔧 {model_name}: 发现 {len(dead_rows)} 个零和行，正在修复...")
+                n_cols = trans.shape[1]
+
+                # Uniform replacement; with zero columns there is nothing to fill.
+                trans[dead_rows, :] = 1.0 / n_cols if n_cols > 0 else 0.0
+
+                # A small epsilon on every entry guards against all-zero rows
+                # before the rows are rescaled.
+                trans += 1e-10
+                _renormalize_rows(trans)
+
+                model.transmat_ = trans
+                print(f"✅ {model_name}: 转移矩阵修复完成")
+
+            # Verify the result and force a renormalization when needed.
+            check_sums = np.sum(model.transmat_, axis=1)
+            if not np.allclose(check_sums, 1.0, atol=1e-6):
+                print(f"⚠️ {model_name}: 转移矩阵行和验证失败: {check_sums}")
+                _renormalize_rows(model.transmat_)
+                print(f"🔧 {model_name}: 强制归一化完成")
+
+            return model
+
+        except Exception as e:
+            print(f"❌ {model_name}: 转移矩阵修复失败: {e}")
+            return model
+
+    def _fix_startprob(self, model, model_name="HMM"):
+        """
+        Repair ``model.startprob_`` when it contains NaN/inf or does not sum to 1.
+
+        A vector containing NaN or inf is replaced by a uniform distribution.
+        Otherwise, a vector whose sum is not close to 1 is divided by its sum,
+        or replaced by a uniform distribution when that sum is not positive.
+
+        Args:
+            model: Object exposing ``startprob_`` and ``n_components``.
+            model_name: Label used in the log output.
+
+        Returns:
+            The same ``model`` object. Any exception is logged and swallowed.
+        """
+        def _uniform():
+            # Equal probability for each of the model's states.
+            return np.full(model.n_components, 1.0 / model.n_components)
+
+        try:
+            current = model.startprob_
+
+            # Non-finite entries: fall back to a uniform start distribution.
+            if np.isnan(current).any() or np.isinf(current).any():
+                print(f"🔧 {model_name}: 发现初始概率包含NaN或inf，正在修复...")
+                model.startprob_ = _uniform()
+                print(f"✅ {model_name}: 初始概率修复完成（均匀分布）。")
+                return model
+
+            total = np.sum(current)
+            if np.allclose(total, 1.0, atol=1e-6):
+                # Already a valid distribution.
+                return model
+
+            print(f"🔧 {model_name}: 初始概率和不为1 ({total})，正在修复...")
+            model.startprob_ = current / total if total > 0 else _uniform()
+            print(f"✅ {model_name}: 初始概率修复完成（归一化）。")
+            return model
+
+        except Exception as e:
+            print(f"❌ {model_name}: 初始概率修复失败: {e}")
+            return model
+
+    def _validate_hmm_model(self, model, model_name="HMM"):
+        """
+        Check that the model's transition matrix and start vector are normalized.
+
+        Only attributes that exist on ``model`` are checked: every row of
+        ``transmat_`` must have a non-zero sum close to 1, and ``startprob_``
+        must sum to approximately 1.
+
+        Args:
+            model: Object that may expose ``transmat_`` and/or ``startprob_``.
+            model_name: Label used in the log output.
+
+        Returns:
+            ``True`` when every available check passes, ``False`` otherwise
+            (including when a check raises an exception).
+        """
+        try:
+            # Transition matrix: no all-zero rows, each row sums to 1.
+            if hasattr(model, 'transmat_'):
+                sums = np.sum(model.transmat_, axis=1)
+
+                if (np.abs(sums) < 1e-10).any():
+                    print(f"⚠️ {model_name}: 转移矩阵存在零行")
+                    return False
+
+                if not np.allclose(sums, 1.0, atol=1e-6):
+                    print(f"⚠️ {model_name}: 转移矩阵行和不为1: {sums}")
+                    return False
+
+            # Start probabilities: must sum to 1.
+            if hasattr(model, 'startprob_'):
+                start_total = np.sum(model.startprob_)
+                if not np.allclose(start_total, 1.0, atol=1e-6):
+                    print(f"⚠️ {model_name}: 起始概率和不为1: {start_total}")
+                    return False
+
+        except Exception as e:
+            print(f"❌ {model_name}: 模型验证失败: {e}")
+            return False
+
+        return True
+
+    def _create_robust_hmm_model(self, n_states, n_gaussians, random_state=None):
+        """
+        Build an untrained ``hmm.GMMHMM`` with fixed, conservative settings.
+
+        Args:
+            n_states: Requested number of hidden states (currently overridden
+                to 1 inside this method).
+            n_gaussians: Requested number of mixture components per state
+                (currently overridden to 1 inside this method).
+            random_state: Seed for the model; ``None`` falls back to
+                ``self.random_state``.
+
+        Returns:
+            A new, unfitted ``hmm.GMMHMM`` instance.
+        """
+        seed = self.random_state if random_state is None else random_state
+
+        # NOTE(review): both requested sizes are discarded and pinned to 1, so
+        # max_states / max_gaussians have no effect on the created model.
+        # Presumably a deliberate stability workaround - confirm before changing.
+        n_states, n_gaussians = 1, 1
+
+        hmm_kwargs = dict(
+            n_components=n_states,
+            n_mix=n_gaussians,
+            covariance_type=self.covariance_type,
+            n_iter=self.n_iter,
+            random_state=seed,
+            tol=1e-2,
+            min_covar=1e-2,
+            init_params='stmc',
+            params='stmc',
+        )
+        model = hmm.GMMHMM(**hmm_kwargs)
+        print(f"创建HMM模型: 状态数={n_states}, 高斯数={n_gaussians}, 迭代={self.n_iter}")
+
+        return model
+
+
+    def _normalize_feature_dimensions(self, feature_vectors: List) -> Tuple[np.ndarray, List[int]]:
+        """
+        Convert heterogeneous per-sample features into one standardized 3-D array.
+
+        Accepted sample formats:
+            * ``dict`` with digit-like keys (``"0"``, ``"1"`` or ``0``, ``1``):
+              keys are time steps; list/ndarray values are flattened per step.
+            * ``dict`` without digit-like keys: all values are flattened into
+              a single time step.
+            * list / ndarray: 1-D is one time step, 2-D is taken as
+              ``(time_steps, features)``, higher ranks are flattened into one
+              time step.
+            * anything else: wrapped with ``np.array([x])``; a single zero is
+              used when that conversion fails.
+
+        Feature widths are zero-padded or truncated to the most common width.
+        When sequence lengths differ, sequences are padded by repeating their
+        last step, or truncated, to ``min(longest, 50)``.
+
+        Side effect: ``self.scaler`` is re-fitted on the stacked data on every
+        call, unless every feature column is constant.
+
+        Args:
+            feature_vectors: List of samples in any of the formats above.
+
+        Returns:
+            ``(array, lengths)`` where ``array`` has shape
+            ``(n_samples, n_timesteps, n_features)`` and ``lengths`` holds the
+            per-sample sequence length after padding/truncation. Empty input
+            yields ``(np.array([]), [])``.
+        """
+        if not feature_vectors:
+            return np.array([]), []
+
+        processed_features = []
+        lengths = []
+
+        # Step 1: bring every sample into a 2-D (time_steps, features) array.
+        for features in feature_vectors:
+            if isinstance(features, dict):
+                # Map integer time step -> original key, so that both "3" and 3
+                # are accepted as keys.
+                step_keys = {int(str(k)): k for k in features if str(k).isdigit()}
+                if step_keys:
+                    feature_sequence = []
+                    for t in sorted(step_keys):
+                        step_features = features[step_keys[t]]
+                        if isinstance(step_features, (list, np.ndarray)):
+                            feature_sequence.append(np.array(step_features).flatten())
+
+                    if feature_sequence:
+                        processed_features.append(np.array(feature_sequence))
+                        lengths.append(len(feature_sequence))
+                    else:
+                        # No usable step values: substitute one zero step.
+                        processed_features.append(np.array([[0.0]]))
+                        lengths.append(1)
+                else:
+                    # No time-step keys: treat all values as one step.
+                    feature_array = np.array(list(features.values())).flatten()
+                    processed_features.append(feature_array.reshape(1, -1))
+                    lengths.append(1)
+
+            elif isinstance(features, (list, np.ndarray)):
+                feature_array = np.array(features)
+                if feature_array.ndim == 2:
+                    # Already (time_steps, features).
+                    processed_features.append(feature_array)
+                    lengths.append(feature_array.shape[0])
+                else:
+                    # 1-D input is one time step; higher ranks are flattened
+                    # into one time step as well.
+                    processed_features.append(feature_array.flatten().reshape(1, -1))
+                    lengths.append(1)
+            else:
+                # Any other type: try a direct conversion.
+                try:
+                    feature_array = np.array([features]).flatten().reshape(1, -1)
+                except (ValueError, TypeError):
+                    # Conversion failed: substitute one zero step.
+                    feature_array = np.array([[0.0]])
+                processed_features.append(feature_array)
+                lengths.append(1)
+
+        # Step 2: unify the feature width across samples.
+        feature_dims = [f.shape[1] for f in processed_features]
+
+        if len(set(feature_dims)) > 1:
+            # Use the most common width as the target.
+            target_dim = max(set(feature_dims), key=feature_dims.count)
+            print(f"🔧 特征维度分布: {set(feature_dims)}, 目标维度: {target_dim}")
+
+            unified_features = []
+            for features in processed_features:
+                current_dim = features.shape[1]
+                if current_dim < target_dim:
+                    # Zero-pad missing feature columns.
+                    padding = np.zeros((features.shape[0], target_dim - current_dim))
+                    unified_features.append(np.concatenate([features, padding], axis=1))
+                elif current_dim > target_dim:
+                    # Drop surplus feature columns.
+                    unified_features.append(features[:, :target_dim])
+                else:
+                    unified_features.append(features)
+            processed_features = unified_features
+
+        # Step 3: unify the number of time steps when they differ.
+        max_length = max(lengths)
+
+        if max_length != min(lengths):
+            # Cap the target length to limit memory use.
+            target_length = min(max_length, 50)
+
+            padded_features = []
+            adjusted_lengths = []
+
+            for features, current_length in zip(processed_features, lengths):
+                if current_length < target_length:
+                    if current_length > 0:
+                        # Pad by repeating the last time step.
+                        tail = features[-1:].repeat(target_length - current_length, axis=0)
+                        padded_features.append(np.concatenate([features, tail], axis=0))
+                    else:
+                        # Empty sequence: fill with zeros.
+                        padded_features.append(np.zeros((target_length, features.shape[1])))
+                    adjusted_lengths.append(target_length)
+                elif current_length > target_length:
+                    # Truncate surplus time steps.
+                    padded_features.append(features[:target_length])
+                    adjusted_lengths.append(target_length)
+                else:
+                    padded_features.append(features)
+                    adjusted_lengths.append(current_length)
+
+            processed_features = padded_features
+            lengths = adjusted_lengths
+
+        # Step 4: stack into a 3-D array and standardize per feature column.
+        dims = [f.shape[1] for f in processed_features]
+        print(f"特征维度分布: {dims}, 平均维度: {np.mean(dims):.1f}")
+
+        X = np.array(processed_features)  # (n_samples, n_timesteps, n_features)
+        X_flat = X.reshape(-1, X.shape[-1])
+
+        # Skip standardization when there is no data or every column is constant.
+        if X_flat.shape[0] > 0 and np.any(np.std(X_flat, axis=0) > 1e-8):
+            self.scaler.fit(X_flat)
+            return self.scaler.transform(X_flat).reshape(X.shape), lengths
+        return X, lengths
+
+    def fit(self, features_list: List[np.ndarray], labels: List[str]) -> Dict[str, Any]:
+        """
+        Train one independent GMM-HMM per intent label.
+
+        Each sample is treated as one observation sequence of shape
+        ``(n_frames, n_features)``. A 1-D sample is interpreted as a single
+        frame and inputs with more than two dimensions are flattened into a
+        single frame, matching what ``predict`` does with its input.
+        NaN/inf values are replaced and all values are clipped to
+        ``[-1e6, 1e6]`` before training.
+
+        Args:
+            features_list: One feature array per sample.
+            labels: Intent label of each sample, aligned with ``features_list``.
+
+        Returns:
+            Dict with ``train_accuracy`` (placeholder, always 0.0),
+            ``n_classes``, ``classes`` and ``n_samples``.
+
+        Raises:
+            Exceptions raised while stacking the sequences or fitting a model
+            are not caught here and propagate to the caller.
+        """
+        print("🚀 开始训练修复版DAG-HMM分类器...")
+        print(f"样本数量: {len(features_list)}")
+        print(f"类别数量: {len(set(labels))}")
+
+        # Encode the labels; class index i corresponds to class_names[i].
+        self.label_encoder = LabelEncoder()
+        encoded_labels = self.label_encoder.fit_transform(labels)
+        self.class_names = list(self.label_encoder.classes_)
+
+        print("📋 类别名称:", self.class_names)
+        class_counts = np.bincount(encoded_labels, minlength=len(self.class_names))
+        for class_name, count in zip(self.class_names, class_counts):
+            print(f"📈 类别 '{class_name}' : {count} 个样本")
+
+        # Train one HMM per intent.
+        self.intent_models = {}
+        for class_idx, class_name in enumerate(self.class_names):
+            print(f"🎯 训练意图 '{class_name}' 的HMM模型...")
+
+            member_indices = np.flatnonzero(encoded_labels == class_idx)
+            if len(member_indices) == 0:
+                print(f"⚠️ 意图 '{class_name}' 没有训练样本，跳过。")
+                continue
+
+            sequences = []
+            for i in member_indices:
+                sequence = np.asarray(features_list[i], dtype=float)
+                # hmmlearn needs (n_frames, n_features) per sequence. Without
+                # this, len() of a 1-D sample would report the feature count
+                # as the sequence length and disagree with the stacked rows.
+                if sequence.ndim != 2:
+                    sequence = sequence.flatten().reshape(1, -1)
+
+                # Replace NaN/inf values.
+                if not np.all(np.isfinite(sequence)):
+                    print(f"⚠️ 发现异常特征值，正在清理...")
+                    sequence = np.nan_to_num(sequence, nan=0.0, posinf=1e6, neginf=-1e6)
+
+                # Keep values inside a bounded range.
+                sequences.append(np.clip(sequence, -1e6, 1e6))
+
+            # hmmlearn training format: all frames stacked + per-sequence lengths.
+            X_class = np.vstack(sequences)
+            lengths_class = [sequence.shape[0] for sequence in sequences]
+
+            model = self._create_robust_hmm_model(self.max_states, self.max_gaussians, self.random_state)
+            model.fit(X_class, lengths_class)
+
+            # Repair non-finite covariance entries left by training.
+            if hasattr(model, 'covars_'):
+                for i, covar in enumerate(model.covars_):
+                    if np.any(np.isnan(covar)) or np.any(np.isinf(covar)):
+                        print(f"❌ 意图 '{class_name}' 状态 {i} 协方差包含异常值")
+                        if self.covariance_type == "diag":
+                            covar[np.isnan(covar)] = 1e-3
+                            covar[np.isinf(covar)] = 1e-3
+                            covar[covar <= 0] = 1e-3
+                        model.covars_[i] = covar
+
+            # Repair the transition matrix and start probabilities after training.
+            model = self._fix_transition_matrix(model, model_name=f"训练后的 {class_name} 模型")
+            model = self._fix_startprob(model, model_name=f"训练后的 {class_name} 模型")
+            self.intent_models[class_name] = model
+            print(f"✅ 意图 '{class_name}' HMM模型训练完成。")
+
+        # Same flag that load_model() sets, so both entry points agree.
+        self.is_trained = bool(self.intent_models)
+
+        print("🎉 训练完成!")
+        return {
+            "train_accuracy": 0.0,
+            "n_classes": len(self.class_names),
+            "classes": self.class_names,
+            "n_samples": len(features_list),
+        }
+
+    def predict(self, features: np.ndarray, species) -> Dict[str, Any]:
+        """
+        Score one feature sequence against the per-intent HMMs of one species.
+
+        Only models whose intent name contains ``species`` as a substring are
+        considered. The input is scored as one sequence: a 1-D array is a
+        single frame, a 2-D array is ``(n_frames, n_features)`` and higher
+        ranks are flattened into a single frame.
+
+        Args:
+            features: Extracted features of the sample.
+            species: Substring used to select the candidate intent models.
+
+        Returns:
+            Dict with ``winner``, ``confidence`` and ``probabilities``:
+              * no model matches ``species``: ``winner`` is ``""``,
+                ``confidence`` is 0 and ``probabilities`` is empty;
+              * every model fails to score: ``winner`` is ``"unknown"``;
+              * best log-likelihood <= 0: ``winner`` is ``""`` and
+                ``probabilities`` holds the raw log-likelihoods;
+              * otherwise ``winner`` is the best intent and ``probabilities``
+                is the softmax over the log-likelihoods.
+            In the last two cases ``confidence`` is the best log-likelihood,
+            not a probability.
+
+        Raises:
+            ValueError: If no model has been trained or loaded.
+        """
+        if not self.intent_models:
+            raise ValueError("模型未训练，请先调用fit方法")
+
+        intent_models = {
+            intent: model for intent, model in self.intent_models.items() if species in intent
+        }
+        if not intent_models:
+            return {
+                "winner": "",
+                "confidence": 0,
+                "probabilities": {}
+            }
+
+        features = np.asarray(features)
+        if features.ndim == 1:
+            # Single frame: add the frame axis -> (1, n_features).
+            features_2d = features.reshape(1, -1)
+            print(f"🔧 特征维度调整: {features.shape} -> {features_2d.shape}")
+        elif features.ndim == 2:
+            features_2d = features
+        else:
+            # Any other rank is flattened into a single frame.
+            features_2d = features.flatten().reshape(1, -1)
+            print(f"🔧 高维特征展平: {features.shape} -> {features_2d.shape}")
+
+        if np.any(np.isnan(features_2d)) or np.any(np.isinf(features_2d)):
+            print(f"⚠️ 输入特征包含NaN或Inf值")
+            features_2d = np.nan_to_num(features_2d, nan=0.0, posinf=1e6, neginf=-1e6)
+            print(f"🔧 异常值已清理")
+
+        feature_max = np.max(np.abs(features_2d))
+        if feature_max > 1e6:
+            print(f"⚠️ 特征值过大: {feature_max}")
+            features_2d = np.clip(features_2d, -1e6, 1e6)
+            print(f"🔧 特征值已裁剪到合理范围")
+        print(f"🔍 输入特征统计: shape={features_2d.shape}, mean={np.mean(features_2d):.3f}, std={np.std(features_2d):.3f}, range=[{np.min(features_2d):.3f}, {np.max(features_2d):.3f}]")
+
+        # The whole input is one sequence, so its length is the frame count.
+        # A fixed length of 1 is rejected by hmmlearn for multi-frame input.
+        sequence_lengths = [features_2d.shape[0]]
+
+        scores = {}
+        for class_name, model in intent_models.items():
+            # Diagnostic output: covariance determinant per state.
+            print(f"🔍 {class_name} 模型协方差矩阵行列式:")
+            if hasattr(model, 'covars_'):
+                for i, covar in enumerate(model.covars_):
+                    if self.covariance_type == "diag":
+                        # Determinant of a diagonal matrix = product of its entries.
+                        det = np.prod(covar)
+                    else:
+                        det = np.linalg.det(covar)
+                    print(f"  状态 {i}: det = {det}")
+                    # np.any keeps the check valid when det is an array.
+                    if np.any(np.asarray(det) <= 0):
+                        print(f"  ⚠️ 状态 {i} 协方差矩阵奇异！")
+            try:
+                scores[class_name] = model.score(features_2d, sequence_lengths)
+            except Exception as e:
+                print(f"❌ 计算意图 '{class_name}' 对数似然失败: {e}")
+                scores[class_name] = -np.inf  # could not be scored
+
+        log_scores = np.array(list(scores.values()))
+        class_names_ordered = list(scores.keys())
+
+        if len(log_scores) == 0 or np.all(log_scores == -np.inf):
+            return {"winner": "unknown", "confidence": 0.0, "probabilities": {}}
+
+        max_log_score = np.max(log_scores)
+        # NOTE(review): a non-positive best log-likelihood is reported as "no
+        # winner". This looks like a rejection threshold; kept unchanged
+        # because callers may depend on it - confirm that it is intended.
+        if max_log_score <= 0:
+            return {
+                "winner": "",
+                "confidence": max_log_score,
+                "probabilities": dict(zip(class_names_ordered, log_scores.tolist()))
+            }
+
+        # Softmax over log-likelihoods; subtracting the maximum avoids overflow.
+        exp_scores = np.exp(log_scores - max_log_score)
+        probabilities = exp_scores / np.sum(exp_scores)
+
+        winner_class = class_names_ordered[int(np.argmax(probabilities))]
+
+        return {
+            "winner": winner_class,
+            "confidence": max_log_score,
+            "probabilities": dict(zip(class_names_ordered, probabilities.tolist()))
+        }
+
+    def evaluate(self, features_list: List[np.ndarray], labels: List[str], species: str = "") -> Dict[str, float]:
+        """
+        Evaluate the classifier on labelled samples.
+
+        Args:
+            features_list: One feature array per sample.
+            labels: True intent label of each sample.
+            species: Substring forwarded to ``predict`` to select candidate
+                intent models. The default ``""`` is contained in every
+                intent name, so all trained models are considered.
+
+        Returns:
+            Dict with ``accuracy`` and weighted ``precision``, ``recall``
+            and ``f1``.
+
+        Raises:
+            ValueError: If no model has been trained or loaded.
+        """
+        if not self.intent_models:
+            raise ValueError("模型未训练，请先调用fit方法")
+
+        print("📊 评估模型性能...")
+
+        # predict() requires the species filter; calling it without one
+        # raises TypeError.
+        predictions = [self.predict(features, species)["winner"] for features in features_list]
+
+        from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+        accuracy = accuracy_score(labels, predictions)
+        precision, recall, f1, _ = precision_recall_fscore_support(
+            labels, predictions, average="weighted", zero_division=0
+        )
+
+        metrics = {
+            "accuracy": accuracy,
+            "precision": precision,
+            "recall": recall,
+            "f1": f1
+        }
+
+        print(f"✅ 评估完成，准确率: {metrics['accuracy']:.4f}")
+        return metrics
+
+    def save_model(self, model_dir: str, model_name: str = "enhanced_dag_hmm_v2_classifier") -> Dict[str, str]:
+        """
+        Persist all model components into ``model_dir``.
+
+        Files written (all prefixed with ``model_name``): one pickle per
+        intent HMM, the label encoder pickle, the class names as JSON and the
+        scaler pickle. The directory is created when it does not exist.
+
+        Args:
+            model_dir: Target directory.
+            model_name: File name prefix.
+
+        Returns:
+            Dict with the written paths under the keys ``intent_models``
+            (itself a dict: intent name -> path), ``label_encoder``,
+            ``class_names`` and ``scaler``.
+        """
+        def _pickle_to(filename, payload):
+            # Pickle ``payload`` into model_dir/filename and return that path.
+            target = os.path.join(model_dir, filename)
+            with open(target, "wb") as handle:
+                pickle.dump(payload, handle)
+            return target
+
+        os.makedirs(model_dir, exist_ok=True)
+
+        # One pickle per intent HMM.
+        saved_models = {
+            intent: _pickle_to(f"{model_name}_{intent}.pkl", hmm_model)
+            for intent, hmm_model in self.intent_models.items()
+        }
+
+        # Label encoder, then the class names as JSON.
+        encoder_file = _pickle_to(f"{model_name}_label_encoder.pkl", self.label_encoder)
+
+        names_file = os.path.join(model_dir, f"{model_name}_class_names.json")
+        with open(names_file, "w") as handle:
+            json.dump(self.class_names, handle)
+
+        # Feature scaler.
+        scaler_file = _pickle_to(f"{model_name}_scaler.pkl", self.scaler)
+
+        print(f"💾 模型已保存到: {model_dir}")
+        return {
+            "intent_models": saved_models,
+            "label_encoder": encoder_file,
+            "class_names": names_file,
+            "scaler": scaler_file,
+        }
+
+    def load_model(self, model_dir: str, model_name: str = "enhanced_dag_hmm_v2_classifier") -> None:
+        """
+        Restore the label encoder, scaler and per-intent HMMs from ``model_dir``.
+
+        The class names are taken from the loaded label encoder. An intent
+        whose model file is missing is skipped with a warning. The transition
+        matrix and start probabilities of every loaded model are repaired.
+
+        SECURITY: the files are read with ``pickle.load``, which can execute
+        arbitrary code - only load directories from a trusted source.
+
+        Args:
+            model_dir: Directory containing the saved files.
+            model_name: File name prefix used when the model was saved.
+
+        Raises:
+            FileNotFoundError: If the label encoder or scaler file is missing.
+        """
+        def _in_dir(suffix):
+            # Full path of the file "<model_name>_<suffix>" in model_dir.
+            return os.path.join(model_dir, f"{model_name}_{suffix}")
+
+        def _unpickle(path):
+            with open(path, "rb") as handle:
+                return pickle.load(handle)
+
+        # Label encoder (mandatory); the class names are derived from it.
+        encoder_file = _in_dir("label_encoder.pkl")
+        if not os.path.exists(encoder_file):
+            raise FileNotFoundError(f"Label encoder文件不存在: {encoder_file}")
+        self.label_encoder = _unpickle(encoder_file)
+        self.class_names = list(self.label_encoder.classes_)
+
+        # Scaler (mandatory).
+        scaler_file = _in_dir("scaler.pkl")
+        if not os.path.exists(scaler_file):
+            raise FileNotFoundError(f"Scaler文件不存在: {scaler_file}")
+        self.scaler = _unpickle(scaler_file)
+
+        # Per-intent HMMs (optional: missing files are skipped).
+        self.intent_models = {}
+        for intent in self.class_names:
+            hmm_file = _in_dir(f"{intent}.pkl")
+            if not os.path.exists(hmm_file):
+                print(f"⚠️ 意图 '{intent}' 的模型文件不存在: {hmm_file}，跳过加载。")
+                continue
+            loaded = _unpickle(hmm_file)
+            # Repair the transition matrix and start probabilities after loading.
+            loaded = self._fix_transition_matrix(loaded, model_name=f"加载的 {intent} 模型")
+            loaded = self._fix_startprob(loaded, model_name=f"加载的 {intent} 模型")
+            self.intent_models[intent] = loaded
+
+        self.is_trained = True
+        print(f"📂 模型已从 {model_dir} 加载")