"""
优化版DAG-HMM分类器模块 - 基于米兰大学论文Algorithm 1的改进实现

主要修复:
1. 添加转移矩阵验证和修复方法
2. 改进HMM参数设置
3. 增强错误处理机制
4. 优化特征处理流程
5. 修复意图分类分数异常问题:为每个意图训练独立的HMM模型,并使用softmax进行概率归一化。
"""
# Standard library / third-party imports (order preserved from original).
import os
import numpy as np
import json
import pickle
from typing import Dict, Any, List, Optional, Tuple
from hmmlearn import hmm
import networkx as nx
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from itertools import combinations
import warnings

# NOTE(review): blanket suppression also hides convergence/deprecation
# warnings from hmmlearn and sklearn — confirm this is intended.
warnings.filterwarnings("ignore")
class DAGHMMClassifier:
    """
    Fixed DAG-HMM classifier — resolves transition-matrix problems and
    the intent-score anomaly.

    Main fixes:
    - repairs zero rows in HMM transition matrices
    - tuned HMM parameter defaults
    - stronger error handling
    - trains one independent HMM per intent and normalizes the per-intent
      log-likelihoods with a softmax
    """
def __init__(self,
|
||
max_states: int = 1,
|
||
max_gaussians: int = 1,
|
||
covariance_type: str = "diag",
|
||
n_iter: int = 100,
|
||
random_state: int = 42,
|
||
cv_folds: int = 5):
|
||
"""
|
||
初始化修复版DAG-HMM分类器
|
||
|
||
参数:
|
||
max_states: 最大隐状态数量(减少以避免稀疏问题)
|
||
max_gaussians: 最大高斯混合成分数(减少以避免过拟合)
|
||
covariance_type: 协方差类型(使用diag避免参数过多)
|
||
n_iter: 训练迭代次数(减少以避免过拟合)
|
||
random_state: 随机种子
|
||
cv_folds: 交叉验证折数
|
||
"""
|
||
self.max_states = self._validate_positive_integer(max_states, "max_states")
|
||
self.max_gaussians = self._validate_positive_integer(max_gaussians, "max_gaussians")
|
||
self.covariance_type = covariance_type
|
||
self.n_iter = n_iter
|
||
self.random_state = random_state
|
||
self.cv_folds = cv_folds
|
||
|
||
# 模型组件
|
||
self.intent_models = {} # 存储每个意图的独立HMM模型
|
||
self.class_names = []
|
||
self.label_encoder = None
|
||
self.scaler = StandardScaler()
|
||
|
||
print("✅ 修复版DAG-HMM分类器已初始化(对数似然修复版)")
|
||
|
||
def _validate_positive_integer(self, value: Any, param_name: str) -> int:
|
||
"""验证并转换为正整数"""
|
||
try:
|
||
int_value = int(value)
|
||
if int_value <= 0:
|
||
raise ValueError(f"{param_name} 必须是正整数,得到: {int_value}")
|
||
return int_value
|
||
except (ValueError, TypeError) as e:
|
||
raise ValueError(f"无法将 {param_name} 转换为正整数: {value}, 错误: {e}")
|
||
|
||
    def _fix_transition_matrix(self, model, model_name="HMM"):
        """
        Repair zero-sum rows in an HMM transition matrix.

        Parameters:
            model: HMM model exposing ``transmat_`` and ``n_components``
            model_name: label used only in log messages

        Returns:
            The (possibly) repaired model; on any exception the model is
            returned unchanged.
        """
        try:
            # Work on the model's transition matrix in place.
            transmat = model.transmat_

            # With zero states there is nothing to normalize; bail out early
            # to avoid dividing by zero below.
            if model.n_components == 0:
                print(f"⚠️ {model_name}: 模型状态数 n_components 为 0,无法修复转移矩阵。")
                return model

            # Locate rows whose probabilities sum to (numerically) zero.
            row_sums = np.sum(transmat, axis=1)
            zero_rows = np.where(np.abs(row_sums) < 1e-10)[0]  # small threshold to detect zero rows

            if len(zero_rows) > 0:
                print(f"🔧 {model_name}: 发现 {len(zero_rows)} 个零和行,正在修复...")
                n_states = transmat.shape[1]

                for row_idx in zero_rows:
                    # Replace each zero row with a uniform distribution;
                    # guard n_states > 0 so an empty matrix cannot divide by zero.
                    if n_states > 0:
                        transmat[row_idx, :] = 1.0 / n_states
                    else:
                        # Should be unreachable (n_components checked above);
                        # handled anyway as an extreme fallback.
                        transmat[row_idx, :] = 0.0  # cannot be repaired meaningfully

                # Add a small epsilon to every entry before normalizing so no
                # row can end up exactly all-zero.
                epsilon = 1e-10
                transmat += epsilon

                # Re-normalize each row to sum to 1, guarding against NaN/inf sums.
                for i in range(transmat.shape[0]):
                    row_sum = np.sum(transmat[i, :])
                    if row_sum > 0 and not np.isnan(row_sum) and not np.isinf(row_sum):
                        transmat[i, :] /= row_sum
                    else:
                        # Row sum is 0/NaN/inf: fall back to a uniform row.
                        if transmat.shape[1] > 0:
                            transmat[i, :] = 1.0 / transmat.shape[1]
                        else:
                            transmat[i, :] = 0.0

                model.transmat_ = transmat
                print(f"✅ {model_name}: 转移矩阵修复完成")

            # Verify the result: every row must now sum to 1.
            final_row_sums = np.sum(model.transmat_, axis=1)
            if not np.allclose(final_row_sums, 1.0, atol=1e-6):
                print(f"⚠️ {model_name}: 转移矩阵行和验证失败: {final_row_sums}")
                # Force a second normalization pass, again guarding NaN/inf.
                for i in range(model.transmat_.shape[0]):
                    row_sum = np.sum(model.transmat_[i, :])
                    if row_sum > 0 and not np.isnan(row_sum) and not np.isinf(row_sum):
                        model.transmat_[i, :] /= row_sum
                    else:
                        if model.transmat_.shape[1] > 0:
                            model.transmat_[i, :] = 1.0 / model.transmat_.shape[1]
                        else:
                            model.transmat_[i, :] = 0.0
                print(f"🔧 {model_name}: 强制归一化完成")

            return model

        except Exception as e:
            print(f"❌ {model_name}: 转移矩阵修复失败: {e}")
            return model
def _fix_startprob(self, model, model_name="HMM"):
|
||
"""
|
||
修复HMM初始概率中的NaN或零和问题
|
||
|
||
参数:
|
||
model: HMM模型
|
||
model_name: 模型名称(用于日志)
|
||
|
||
返回:
|
||
修复后的模型
|
||
"""
|
||
try:
|
||
startprob = model.startprob_
|
||
|
||
# 检查是否存在NaN或inf
|
||
if np.any(np.isnan(startprob)) or np.any(np.isinf(startprob)):
|
||
print(f"🔧 {model_name}: 发现初始概率包含NaN或inf,正在修复...")
|
||
# 重新初始化为均匀分布
|
||
model.startprob_ = np.full(model.n_components, 1.0 / model.n_components)
|
||
print(f"✅ {model_name}: 初始概率修复完成(均匀分布)。")
|
||
return model
|
||
|
||
# 检查和是否为1
|
||
startprob_sum = np.sum(startprob)
|
||
if not np.allclose(startprob_sum, 1.0, atol=1e-6):
|
||
print(f"🔧 {model_name}: 初始概率和不为1 ({startprob_sum}),正在修复...")
|
||
if startprob_sum > 0:
|
||
model.startprob_ = startprob / startprob_sum
|
||
else:
|
||
# 如果和为0,则重新初始化为均匀分布
|
||
model.startprob_ = np.full(model.n_components, 1.0 / model.n_components)
|
||
print(f"✅ {model_name}: 初始概率修复完成(归一化)。")
|
||
|
||
return model
|
||
|
||
except Exception as e:
|
||
print(f"❌ {model_name}: 初始概率修复失败: {e}")
|
||
return model
|
||
|
||
def _validate_hmm_model(self, model, model_name="HMM"):
|
||
"""
|
||
验证HMM模型的有效性
|
||
|
||
参数:
|
||
model: HMM模型
|
||
model_name: 模型名称
|
||
|
||
返回:
|
||
是否有效
|
||
"""
|
||
try:
|
||
# 检查转移矩阵
|
||
if hasattr(model, 'transmat_'):
|
||
transmat = model.transmat_
|
||
row_sums = np.sum(transmat, axis=1)
|
||
|
||
# 检查是否有零行
|
||
if np.any(np.abs(row_sums) < 1e-10):
|
||
print(f"⚠️ {model_name}: 转移矩阵存在零行")
|
||
return False
|
||
|
||
# 检查行和是否为1
|
||
if not np.allclose(row_sums, 1.0, atol=1e-6):
|
||
print(f"⚠️ {model_name}: 转移矩阵行和不为1: {row_sums}")
|
||
return False
|
||
|
||
# 检查起始概率
|
||
if hasattr(model, 'startprob_'):
|
||
startprob_sum = np.sum(model.startprob_)
|
||
if not np.allclose(startprob_sum, 1.0, atol=1e-6):
|
||
print(f"⚠️ {model_name}: 起始概率和不为1: {startprob_sum}")
|
||
return False
|
||
|
||
return True
|
||
|
||
except Exception as e:
|
||
print(f"❌ {model_name}: 模型验证失败: {e}")
|
||
return False
|
||
|
||
    def _create_robust_hmm_model(self, n_states, n_gaussians, random_state=None):
        """
        Create a GMM-HMM configured for robust training.

        Parameters:
            n_states: requested number of hidden states
            n_gaussians: requested number of Gaussian mixture components
            random_state: random seed (defaults to self.random_state)

        Returns:
            An untrained hmm.GMMHMM instance.

        NOTE(review): both arguments are overridden to 1 below, so the
        requested values are ignored — confirm this clamp is intentional.
        """
        if random_state is None:
            random_state = self.random_state

        # Force minimal model complexity regardless of the requested values.
        n_states = 1  # cap the number of hidden states
        n_gaussians = 1  # Gaussians must not exceed the number of states

        model = hmm.GMMHMM(
            n_components=n_states,
            n_mix=n_gaussians,
            covariance_type=self.covariance_type,
            n_iter=self.n_iter,
            random_state=random_state,
            tol=1e-2,  # loose EM convergence tolerance
            min_covar=1e-2,  # covariance floor to avoid singular matrices
            init_params='stmc',
            params='stmc'
        )
        print(f"创建HMM模型: 状态数={n_states}, 高斯数={n_gaussians}, 迭代={self.n_iter}")

        return model
    def _normalize_feature_dimensions(self, feature_vectors: List) -> Tuple[np.ndarray, List[int]]:
        """
        Normalize heterogeneous feature inputs into one 3-D array, preserving
        the time dimension.

        Parameters:
            feature_vectors: list whose items may be dicts keyed by time step,
                1-D/2-D/ND arrays or lists, or arbitrary scalars.

        Returns:
            normalized_array: 3-D array (n_samples, n_timesteps, n_features)
            lengths: effective (pre-padding) length of each sample

        Side effect: fits ``self.scaler`` on the flattened features when any
        feature column has non-zero variance.
        """
        if not feature_vectors:
            return np.array([]), []

        processed_features = []
        lengths = []

        # Step 1: bring every sample to (timesteps, features) shape and
        # collect all per-step vectors for fitting the scaler.
        all_features = []  # pooled per-timestep vectors for mean/std fitting
        for features in feature_vectors:
            if isinstance(features, dict):
                # Dict format: keys are (string) time-step indices.
                time_steps = sorted([int(k) for k in features.keys() if k.isdigit()])
                if time_steps:
                    feature_sequence = []
                    for t in time_steps:
                        step_features = features[str(t)]
                        if isinstance(step_features, (list, np.ndarray)):
                            step_array = np.array(step_features).flatten()
                            feature_sequence.append(step_array)
                            all_features.append(step_array)  # pooled for the scaler

                    if feature_sequence:
                        processed_features.append(np.array(feature_sequence))
                        lengths.append(len(feature_sequence))
                    else:
                        # Empty sequence: fall back to a single zero step.
                        processed_features.append(np.array([[0.0]]))
                        lengths.append(1)
                        all_features.append(np.array([0.0]))
                else:
                    # No digit keys: treat all values as one single time step.
                    feature_array = np.array(list(features.values())).flatten()
                    processed_features.append(feature_array.reshape(1, -1))
                    lengths.append(1)
                    all_features.append(feature_array)

            elif isinstance(features, (list, np.ndarray)):
                feature_array = np.array(features)
                if feature_array.ndim == 1:
                    # 1-D: a single time step.
                    processed_features.append(feature_array.reshape(1, -1))
                    lengths.append(1)
                    all_features.append(feature_array)
                elif feature_array.ndim == 2:
                    # 2-D: assumed layout (time_steps, features).
                    processed_features.append(feature_array)
                    lengths.append(feature_array.shape[0])
                    for t in range(feature_array.shape[0]):
                        all_features.append(feature_array[t])
                else:
                    # Higher-dimensional input: flatten to one step.
                    flattened = feature_array.flatten()
                    processed_features.append(flattened.reshape(1, -1))
                    lengths.append(1)
                    all_features.append(flattened)
            else:
                # Anything else: best-effort conversion to one step.
                try:
                    feature_array = np.array([features]).flatten()
                    processed_features.append(feature_array.reshape(1, -1))
                    lengths.append(1)
                    all_features.append(feature_array)
                except:
                    # Conversion failed: substitute a zero vector.
                    processed_features.append(np.array([[0.0]]))
                    lengths.append(1)
                    all_features.append(np.array([0.0]))

        if not processed_features:
            return np.array([]), []

        # Step 2: unify the per-step feature dimension across samples.
        feature_dims = [f.shape[1] for f in processed_features]
        unique_dims = list(set(feature_dims))

        if len(unique_dims) > 1:
            # Mixed dimensions: pad/truncate to the most common one.
            target_dim = max(set(feature_dims), key=feature_dims.count)  # most frequent dimension wins
            print(f"🔧 特征维度分布: {set(feature_dims)}, 目标维度: {target_dim}")

            unified_features = []
            for features in processed_features:
                current_dim = features.shape[1]
                if current_dim < target_dim:
                    # Zero-pad on the right.
                    padding_size = target_dim - current_dim
                    padding = np.zeros((features.shape[0], padding_size))
                    unified_features.append(np.concatenate([features, padding], axis=1))
                elif current_dim > target_dim:
                    # Truncate extra columns.
                    unified_features.append(features[:, :target_dim])
                else:
                    unified_features.append(features)
            processed_features = unified_features

        # Step 3: unify the number of time steps across samples.
        max_length = max(lengths)
        min_length = min(lengths)

        if max_length != min_length:
            # Variable-length sequences: pad/truncate to a common length.
            target_length = min(max_length, 50)  # cap length to bound memory

            padded_features = []
            adjusted_lengths = []

            for i, features in enumerate(processed_features):
                current_length = lengths[i]
                if current_length < target_length:
                    # Pad by repeating the last time step.
                    padding_steps = target_length - current_length
                    if current_length > 0:
                        last_step = features[-1:].repeat(padding_steps, axis=0)
                        padded_features.append(np.concatenate([features, last_step], axis=0))
                    else:
                        # Empty sequence: pad with zeros.
                        zero_padding = np.zeros((target_length, features.shape[1]))
                        padded_features.append(zero_padding)
                    adjusted_lengths.append(target_length)
                elif current_length > target_length:
                    # Truncate to the target length.
                    padded_features.append(features[:target_length])
                    adjusted_lengths.append(target_length)
                else:
                    padded_features.append(features)
                    adjusted_lengths.append(current_length)

            processed_features = padded_features
            lengths = adjusted_lengths

        # Step 4: stack into a 3-D array and (optionally) standardize.
        if processed_features:
            dims = [f.shape[1] for f in processed_features]
            print(f"特征维度分布: {dims}, 平均维度: {np.mean(dims):.1f}")

            # Stack to (n_samples, n_timesteps, n_features).
            X = np.array(processed_features)
            X_flat = X.reshape(-1, X.shape[-1])
            # Standardize only when there is data and at least one feature
            # has non-zero variance (otherwise StandardScaler is pointless).
            if X_flat.shape[0] > 0 and np.any(np.std(X_flat, axis=0) > 1e-8):
                self.scaler.fit(X_flat)
                normalized_X_flat = self.scaler.transform(X_flat)
                normalized_X = normalized_X_flat.reshape(X.shape)
            else:
                # All-constant or empty features: return unscaled data.
                normalized_X = X
            return normalized_X, lengths
        return np.array([]), []
    def fit(self, features_list: List[np.ndarray], labels: List[str]) -> Dict[str, Any]:
        """
        Train one independent HMM per intent class.

        Parameters:
            features_list: per-sample feature arrays (each stackable with
                np.vstack; assumed (timesteps, features) — TODO confirm)
            labels: string intent labels, parallel to features_list

        Returns:
            Dict of training metadata (class names, counts).
            NOTE(review): "train_accuracy" is a hard-coded 0.0 placeholder —
            no accuracy is actually computed here.
        """
        print("🚀 开始训练修复版DAG-HMM分类器...")
        print(f"样本数量: {len(features_list)}")
        print(f"类别数量: {len(set(labels))}")

        # Encode string labels into integer classes.
        self.label_encoder = LabelEncoder()
        encoded_labels = self.label_encoder.fit_transform(labels)
        self.class_names = list(self.label_encoder.classes_)

        print("📋 类别名称:", self.class_names)
        for i, class_name in enumerate(self.class_names):
            count = np.sum(np.array(labels) == class_name)
            print(f"📈 类别 \'{class_name}\' : {count} 个样本")

        # Group the feature arrays by class.
        features_by_class = {}
        for class_name in self.class_names:
            class_indices = [i for i, label in enumerate(labels) if label == class_name]
            features_by_class[class_name] = [features_list[i] for i in class_indices]

        # Train one independent HMM model per intent.
        self.intent_models = {}
        for class_name, class_features in features_by_class.items():
            print(f"🎯 训练意图 \'{class_name}\' 的HMM模型...")

            # NOTE(review): this recomputes class_features via the label
            # encoder, shadowing the loop variable with identical content.
            class_indices = np.where(encoded_labels == self.label_encoder.transform([class_name])[0])[0]
            class_features = [features_list[i] for i in class_indices]
            if len(class_features) == 0:
                print(f"⚠️ 意图 '{class_name}' 没有训练样本,跳过。")
                continue

            cleaned_features = []
            for features in class_features:
                # Replace NaN/inf so EM training cannot blow up.
                if np.any(np.isnan(features)) or np.any(np.isinf(features)):
                    print(f"⚠️ 发现异常特征值,正在清理...")
                    features = np.nan_to_num(features, nan=0.0, posinf=1e6, neginf=-1e6)

                # Clamp features into a sane numeric range.
                features = np.clip(features, -1e6, 1e6)
                cleaned_features.append(features)
            # hmmlearn format: stacked 2-D observations plus per-sequence lengths.
            X_class = np.vstack(cleaned_features)
            lengths_class = [len(f) for f in cleaned_features]
            if np.any(np.isnan(X_class)) or np.any(np.isinf(X_class)):
                print(f"❌ 意图 '{class_name}' 合并后仍有异常值")
                continue

            model = self._create_robust_hmm_model(self.max_states, self.max_gaussians, self.random_state)

            # NOTE(review): model.fit is not wrapped in try/except — a failed
            # EM fit for one intent aborts the whole training loop.
            model.fit(X_class, lengths_class)
            # Post-training sanitation: covariances, transition matrix,
            # start probabilities.
            if hasattr(model, 'covars_'):
                for i, covar in enumerate(model.covars_):
                    if np.any(np.isnan(covar)) or np.any(np.isinf(covar)):
                        print(f"❌ 意图 '{class_name}' 状态 {i} 协方差包含异常值")
                        # Force-repair covariance entries (diag type only).
                        if self.covariance_type == "diag":
                            covar[np.isnan(covar)] = 1e-3
                            covar[np.isinf(covar)] = 1e-3
                            covar[covar <= 0] = 1e-3
                            model.covars_[i] = covar
            model = self._fix_transition_matrix(model, model_name=f"训练后的 {class_name} 模型")
            model = self._fix_startprob(model, model_name=f"训练后的 {class_name} 模型")
            self.intent_models[class_name] = model
            print(f"✅ 意图 \'{class_name}\' HMM模型训练完成。")

        print("🎉 训练完成!")
        return {
            "train_accuracy": 0.0,
            "n_classes": len(self.class_names),
            "classes": self.class_names,
            "n_samples": len(features_list),
        }
def predict(self, features: np.ndarray, species) -> Dict[str, Any]:
|
||
"""
|
||
预测音频的意图
|
||
|
||
参数:
|
||
features: 提取的特征
|
||
species: 物种
|
||
|
||
返回:
|
||
result: 预测结果
|
||
"""
|
||
if not self.intent_models:
|
||
raise ValueError("模型未训练,请先调用fit方法")
|
||
|
||
intent_models = {
|
||
intent: model for intent, model in self.intent_models.items() if species in intent
|
||
}
|
||
if not intent_models:
|
||
return {
|
||
"winner": "",
|
||
"confidence": 0,
|
||
"probabilities": {}
|
||
}
|
||
|
||
if features.ndim == 1:
|
||
features_2d = features.reshape(1, -1) # 添加样本维度,变为 (1, n_features)
|
||
print(f"🔧 特征维度调整: {features.shape} -> {features_2d.shape}")
|
||
elif features.ndim == 2:
|
||
features_2d = features
|
||
else:
|
||
# 高维特征展平
|
||
features_2d = features.flatten().reshape(1, -1)
|
||
print(f"🔧 高维特征展平: {features.shape} -> {features_2d.shape}")
|
||
|
||
if np.any(np.isnan(features_2d)) or np.any(np.isinf(features_2d)):
|
||
print(f"⚠️ 输入特征包含NaN或Inf值")
|
||
# 清理异常值
|
||
features_2d = np.nan_to_num(features_2d, nan=0.0, posinf=1e6, neginf=-1e6)
|
||
print(f"🔧 异常值已清理")
|
||
# HMMlearn 的 score 方法期望二维数组 (n_samples, n_features) 和对应的长度列表
|
||
# feature_length = len(features_2d.shape)
|
||
feature_max = np.max(np.abs(features_2d))
|
||
if feature_max > 1e6:
|
||
print(f"⚠️ 特征值过大: {feature_max}")
|
||
features_2d = np.clip(features_2d, -1e6, 1e6)
|
||
print(f"🔧 特征值已裁剪到合理范围")
|
||
print(f"🔍 输入特征统计: shape={features_2d.shape}, mean={np.mean(features_2d):.3f}, std={np.std(features_2d):.3f}, range=[{np.min(features_2d):.3f}, {np.max(features_2d):.3f}]")
|
||
|
||
scores = {}
|
||
|
||
for class_name, model in intent_models.items():
|
||
print(f"🔍 {class_name} 模型协方差矩阵行列式:")
|
||
if hasattr(model, 'covars_'):
|
||
for i, covar in enumerate(model.covars_):
|
||
if self.covariance_type == "diag":
|
||
det = np.prod(covar) # 对角矩阵的行列式是对角元素的乘积
|
||
else:
|
||
det = np.linalg.det(covar)
|
||
print(f" 状态 {i}: det = {det}")
|
||
if det <= 0:
|
||
print(f" ⚠️ 状态 {i} 协方差矩阵奇异!")
|
||
try:
|
||
# 确保模型状态(特别是转移矩阵和初始概率)在计算分数前是有效的
|
||
# 所以这里需要先检查属性是否存在
|
||
# model = self._fix_transition_matrix(model, model_name=f"意图 {class_name} 预测")
|
||
# model = self._fix_startprob(model, model_name=f"意图 {class_name} 预测")
|
||
|
||
# 计算对数似然分数
|
||
score = model.score(features_2d, [1])
|
||
scores[class_name] = score
|
||
except Exception as e:
|
||
print(f"❌ 计算意图 \'{class_name}\' 对数似然失败: {e}")
|
||
scores[class_name] = -np.inf # 无法计算分数,设为负无穷
|
||
|
||
# 将对数似然转换为概率 (使用 log-sum-exp 技巧)
|
||
log_scores = np.array(list(scores.values()))
|
||
class_names_ordered = list(scores.keys())
|
||
|
||
if len(log_scores) == 0 or np.all(log_scores == -np.inf):
|
||
return {"winner": "unknown", "confidence": 0.0, "probabilities": {}}
|
||
|
||
max_log_score = np.max(log_scores)
|
||
if max_log_score <= 0:
|
||
return {
|
||
"winner": "",
|
||
"confidence": max_log_score,
|
||
"probabilities": dict(zip(class_names_ordered, log_scores.tolist()))
|
||
}
|
||
# 减去最大值以避免指数溢出
|
||
exp_scores = np.exp(log_scores - max_log_score)
|
||
probabilities = exp_scores / np.sum(exp_scores)
|
||
|
||
# 找到最高概率的意图
|
||
winner_idx = np.argmax(probabilities)
|
||
winner_class = class_names_ordered[winner_idx]
|
||
confidence = probabilities[winner_idx]
|
||
|
||
return {
|
||
"winner": winner_class,
|
||
"confidence": max_log_score,
|
||
"probabilities": dict(zip(class_names_ordered, probabilities.tolist()))
|
||
}
|
||
|
||
def evaluate(self, features_list: List[np.ndarray], labels: List[str]) -> Dict[str, float]:
|
||
"""
|
||
评估模型性能
|
||
|
||
参数:
|
||
features_list: 特征列表
|
||
labels: 标签列表
|
||
|
||
返回:
|
||
metrics: 评估指标
|
||
"""
|
||
if not self.intent_models:
|
||
raise ValueError("模型未训练,请先调用fit方法")
|
||
|
||
print("📊 评估模型性能...")
|
||
|
||
predictions = []
|
||
for features in features_list:
|
||
result = self.predict(features)
|
||
predictions.append(result["winner"])
|
||
|
||
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
|
||
accuracy = accuracy_score(labels, predictions)
|
||
precision, recall, f1, _ = precision_recall_fscore_support(
|
||
labels, predictions, average="weighted", zero_division=0
|
||
)
|
||
|
||
metrics = {
|
||
"accuracy": accuracy,
|
||
"precision": precision,
|
||
"recall": recall,
|
||
"f1": f1
|
||
}
|
||
|
||
print(f"✅ 评估完成,准确率: {metrics['accuracy']:.4f}")
|
||
return metrics
|
||
|
||
def save_model(self, model_dir: str, model_name: str = "enhanced_dag_hmm_v2_classifier") -> Dict[str, str]:
|
||
"""
|
||
保存模型
|
||
|
||
参数:
|
||
model_dir: 模型保存目录
|
||
model_name: 模型名称
|
||
|
||
返回:
|
||
paths: 保存路径字典
|
||
"""
|
||
os.makedirs(model_dir, exist_ok=True)
|
||
|
||
# 保存每个意图的HMM模型
|
||
model_paths = {}
|
||
for class_name, model in self.intent_models.items():
|
||
model_path = os.path.join(model_dir, f"{model_name}_{class_name}.pkl")
|
||
with open(model_path, "wb") as f:
|
||
pickle.dump(model, f)
|
||
model_paths[class_name] = model_path
|
||
|
||
# 保存label encoder和class names
|
||
label_encoder_path = os.path.join(model_dir, f"{model_name}_label_encoder.pkl")
|
||
with open(label_encoder_path, "wb") as f:
|
||
pickle.dump(self.label_encoder, f)
|
||
|
||
class_names_path = os.path.join(model_dir, f"{model_name}_class_names.json")
|
||
with open(class_names_path, "w") as f:
|
||
json.dump(self.class_names, f)
|
||
|
||
# 保存scaler
|
||
scaler_path = os.path.join(model_dir, f"{model_name}_scaler.pkl")
|
||
with open(scaler_path, "wb") as f:
|
||
pickle.dump(self.scaler, f)
|
||
|
||
print(f"💾 模型已保存到: {model_dir}")
|
||
return {"intent_models": model_paths, "label_encoder": label_encoder_path, "class_names": class_names_path, "scaler": scaler_path}
|
||
|
||
def load_model(self, model_dir: str, model_name: str = "enhanced_dag_hmm_v2_classifier") -> None:
|
||
"""
|
||
加载模型
|
||
|
||
参数:
|
||
model_dir: 模型目录
|
||
model_name: 模型名称
|
||
"""
|
||
# 加载label encoder和class names
|
||
label_encoder_path = os.path.join(model_dir, f"{model_name}_label_encoder.pkl")
|
||
if not os.path.exists(label_encoder_path):
|
||
raise FileNotFoundError(f"Label encoder文件不存在: {label_encoder_path}")
|
||
with open(label_encoder_path, "rb") as f:
|
||
self.label_encoder = pickle.load(f)
|
||
self.class_names = list(self.label_encoder.classes_)
|
||
|
||
# 加载scaler
|
||
scaler_path = os.path.join(model_dir, f"{model_name}_scaler.pkl")
|
||
if not os.path.exists(scaler_path):
|
||
raise FileNotFoundError(f"Scaler文件不存在: {scaler_path}")
|
||
with open(scaler_path, "rb") as f:
|
||
self.scaler = pickle.load(f)
|
||
|
||
# 加载每个意图的HMM模型
|
||
self.intent_models = {}
|
||
for class_name in self.class_names:
|
||
model_path = os.path.join(model_dir, f"{model_name}_{class_name}.pkl")
|
||
if not os.path.exists(model_path):
|
||
print(f"⚠️ 意图 \'{class_name}\' 的模型文件不存在: {model_path},跳过加载。")
|
||
continue
|
||
with open(model_path, "rb") as f:
|
||
model = pickle.load(f)
|
||
# 修复加载模型的转移矩阵和初始概率
|
||
model = self._fix_transition_matrix(model, model_name=f"加载的 {class_name} 模型")
|
||
model = self._fix_startprob(model, model_name=f"加载的 {class_name} 模型")
|
||
self.intent_models[class_name] = model
|
||
|
||
self.is_trained = True
|
||
print(f"📂 模型已从 {model_dir} 加载")
|