feat: first commit
This commit is contained in:
745
src/dag_hmm_classifier.py
Normal file
745
src/dag_hmm_classifier.py
Normal file
@@ -0,0 +1,745 @@
|
||||
"""
|
||||
优化版DAG-HMM分类器模块 - 基于米兰大学论文Algorithm 1的改进实现
|
||||
|
||||
主要修复:
|
||||
1. 添加转移矩阵验证和修复方法
|
||||
2. 改进HMM参数设置
|
||||
3. 增强错误处理机制
|
||||
4. 优化特征处理流程
|
||||
5. 修复意图分类分数异常问题:为每个意图训练独立的HMM模型,并使用softmax进行概率归一化。
|
||||
"""
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import json
|
||||
import pickle
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
from hmmlearn import hmm
|
||||
import networkx as nx
|
||||
from sklearn.preprocessing import LabelEncoder, StandardScaler
|
||||
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
|
||||
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
|
||||
from itertools import combinations
|
||||
import warnings
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
class DAGHMMClassifier:
    """Intent classifier that trains one independent GMM-HMM per intent label.

    ``fit`` trains a separate ``hmm.GMMHMM`` for every distinct label,
    ``predict`` scores an observation sequence against the models whose
    intent name contains the requested species and normalises the
    log-likelihoods with a softmax, and ``save_model`` / ``load_model``
    persist the per-intent models with ``pickle``.

    The class also carries repair helpers that keep the transition matrix
    and start probabilities of a model row-stochastic after training or
    loading.
    """

    def __init__(self,
                 max_states: int = 1,
                 max_gaussians: int = 1,
                 covariance_type: str = "diag",
                 n_iter: int = 100,
                 random_state: int = 42,
                 cv_folds: int = 5):
        """Store the hyper-parameters and create empty model containers.

        Args:
            max_states: Upper bound on hidden states per HMM (positive int).
            max_gaussians: Upper bound on mixture components per state
                (positive int).
            covariance_type: Covariance type forwarded to ``hmm.GMMHMM``.
            n_iter: Number of EM iterations forwarded to ``hmm.GMMHMM``.
            random_state: Random seed forwarded to ``hmm.GMMHMM``.
            cv_folds: Number of cross-validation folds (stored only; not
                used by any method of this class).

        Raises:
            ValueError: If ``max_states`` or ``max_gaussians`` is not a
                positive integer.
        """
        self.max_states = self._validate_positive_integer(max_states, "max_states")
        self.max_gaussians = self._validate_positive_integer(max_gaussians, "max_gaussians")
        self.covariance_type = covariance_type
        self.n_iter = n_iter
        self.random_state = random_state
        self.cv_folds = cv_folds

        # Model components.
        self.intent_models = {}  # intent name -> trained HMM
        self.class_names = []
        self.label_encoder = None
        self.scaler = StandardScaler()
        # load_model() sets this flag; initialise it so the attribute always exists.
        self.is_trained = False

        print("✅ 修复版DAG-HMM分类器已初始化(对数似然修复版)")

    def _validate_positive_integer(self, value: Any, param_name: str) -> int:
        """Convert ``value`` to ``int`` and require it to be strictly positive.

        Args:
            value: Value to convert.
            param_name: Parameter name used in the error message.

        Returns:
            The converted positive integer.

        Raises:
            ValueError: If the value cannot be converted, or is <= 0.
        """
        # Only the conversion lives inside the try block, so the range error
        # below is not caught and re-wrapped as a "cannot convert" error.
        try:
            int_value = int(value)
        except (ValueError, TypeError, OverflowError) as e:
            raise ValueError(f"无法将 {param_name} 转换为正整数: {value}, 错误: {e}") from e

        if int_value <= 0:
            raise ValueError(f"{param_name} 必须是正整数,得到: {int_value}")
        return int_value

    @staticmethod
    def _normalize_probability_rows(matrix) -> np.ndarray:
        """Return a float copy of ``matrix`` in which every row sums to 1.

        Rows with a finite positive sum and no negative or NaN entries are
        divided by their sum; every other row is replaced by the uniform
        distribution. A matrix with zero columns is returned unchanged.
        """
        normalized = np.array(matrix, dtype=float)
        n_cols = normalized.shape[1]
        if n_cols == 0:
            return normalized

        row_sums = normalized.sum(axis=1)
        # ``>= 0`` is False for NaN, so rows containing NaN count as invalid.
        valid = (
            np.isfinite(row_sums)
            & (row_sums > 0)
            & np.all(normalized >= 0, axis=1)
        )
        normalized[valid] /= row_sums[valid, np.newaxis]
        normalized[~valid] = 1.0 / n_cols
        return normalized

    def _fix_transition_matrix(self, model, model_name="HMM"):
        """Repair zero-sum or badly normalised rows of ``model.transmat_``.

        Args:
            model: Object exposing ``n_components`` and ``transmat_``.
            model_name: Name used in the log messages.

        Returns:
            The same ``model`` object; ``transmat_`` is replaced when a
            repair was necessary. Any failure is logged and the model is
            returned unrepaired.
        """
        try:
            # Nothing can be normalised for a model without states.
            if model.n_components == 0:
                print(f"⚠️ {model_name}: 模型状态数 n_components 为 0,无法修复转移矩阵。")
                return model

            # Work on a copy so a failure cannot leave a half-edited matrix.
            transmat = np.array(model.transmat_, dtype=float)

            row_sums = transmat.sum(axis=1)
            zero_rows = np.flatnonzero(np.abs(row_sums) < 1e-10)
            if len(zero_rows) > 0:
                print(f"🔧 {model_name}: 发现 {len(zero_rows)} 个零和行,正在修复...")
                # No epsilon is added to the other entries, so transitions
                # that were exactly zero stay zero.
                transmat = self._normalize_probability_rows(transmat)
                model.transmat_ = transmat
                print(f"✅ {model_name}: 转移矩阵修复完成")

            # Verify the result; this also catches NaN/inf rows, negative
            # entries and rows whose sum simply drifted away from 1.
            final_row_sums = transmat.sum(axis=1)
            if not np.allclose(final_row_sums, 1.0, atol=1e-6) or np.any(transmat < 0):
                print(f"⚠️ {model_name}: 转移矩阵行和验证失败: {final_row_sums}")
                model.transmat_ = self._normalize_probability_rows(transmat)
                print(f"🔧 {model_name}: 强制归一化完成")

            return model

        except Exception as e:
            # Deliberately best-effort: a failed repair must not abort
            # training or loading.
            print(f"❌ {model_name}: 转移矩阵修复失败: {e}")
            return model

    def _fix_startprob(self, model, model_name="HMM"):
        """Repair NaN/inf or badly normalised ``model.startprob_``.

        Args:
            model: Object exposing ``n_components`` and ``startprob_``.
            model_name: Name used in the log messages.

        Returns:
            The same ``model`` object. Non-finite start probabilities are
            reset to uniform; a sum different from 1 is renormalised (or
            reset to uniform when the sum is not positive). Any failure is
            logged and the model is returned unrepaired.
        """
        try:
            startprob = model.startprob_

            # NaN or inf anywhere: fall back to the uniform distribution.
            if not np.all(np.isfinite(startprob)):
                print(f"🔧 {model_name}: 发现初始概率包含NaN或inf,正在修复...")
                model.startprob_ = np.full(model.n_components, 1.0 / model.n_components)
                print(f"✅ {model_name}: 初始概率修复完成(均匀分布)。")
                return model

            startprob_sum = np.sum(startprob)
            if not np.allclose(startprob_sum, 1.0, atol=1e-6):
                print(f"🔧 {model_name}: 初始概率和不为1 ({startprob_sum}),正在修复...")
                if startprob_sum > 0:
                    model.startprob_ = startprob / startprob_sum
                else:
                    # A non-positive sum cannot be rescaled.
                    model.startprob_ = np.full(model.n_components, 1.0 / model.n_components)
                print(f"✅ {model_name}: 初始概率修复完成(归一化)。")

            return model

        except Exception as e:
            # Deliberately best-effort, mirroring _fix_transition_matrix.
            print(f"❌ {model_name}: 初始概率修复失败: {e}")
            return model

    def _validate_hmm_model(self, model, model_name="HMM"):
        """Check that a model's transition matrix and start vector are stochastic.

        Args:
            model: Object that may expose ``transmat_`` and ``startprob_``.
            model_name: Name used in the log messages.

        Returns:
            ``True`` when every present attribute sums to 1 (per row for the
            transition matrix), ``False`` otherwise or when the check itself
            raises.
        """
        try:
            if hasattr(model, 'transmat_'):
                row_sums = np.sum(model.transmat_, axis=1)

                if np.any(np.abs(row_sums) < 1e-10):
                    print(f"⚠️ {model_name}: 转移矩阵存在零行")
                    return False

                if not np.allclose(row_sums, 1.0, atol=1e-6):
                    print(f"⚠️ {model_name}: 转移矩阵行和不为1: {row_sums}")
                    return False

            if hasattr(model, 'startprob_'):
                startprob_sum = np.sum(model.startprob_)
                if not np.allclose(startprob_sum, 1.0, atol=1e-6):
                    print(f"⚠️ {model_name}: 起始概率和不为1: {startprob_sum}")
                    return False

            return True

        except Exception as e:
            print(f"❌ {model_name}: 模型验证失败: {e}")
            return False

    def _create_robust_hmm_model(self, n_states, n_gaussians, random_state=None):
        """Create an untrained ``hmm.GMMHMM`` with conservative settings.

        Args:
            n_states: Requested number of hidden states; clamped to
                ``[1, self.max_states]``.
            n_gaussians: Requested number of mixture components; clamped to
                ``[1, self.max_gaussians]``.
            random_state: Random seed; defaults to ``self.random_state``.

        Returns:
            The configured, unfitted model.
        """
        if random_state is None:
            random_state = self.random_state

        # Honour the requested sizes within the configured maxima. With the
        # default configuration (1 state, 1 Gaussian) this yields 1 and 1.
        n_states = max(1, min(int(n_states), self.max_states))
        n_gaussians = max(1, min(int(n_gaussians), self.max_gaussians))

        model = hmm.GMMHMM(
            n_components=n_states,
            n_mix=n_gaussians,
            covariance_type=self.covariance_type,
            n_iter=self.n_iter,
            random_state=random_state,
            tol=1e-2,
            min_covar=1e-2,
            init_params='stmc',
            params='stmc',
        )
        print(f"创建HMM模型: 状态数={n_states}, 高斯数={n_gaussians}, 迭代={self.n_iter}")

        return model

    @staticmethod
    def _as_feature_sequence(sample) -> np.ndarray:
        """Convert one raw sample into a 2-D ``(n_timesteps, n_features)`` array.

        Supported inputs:
            * dict whose digit-like keys are time steps: the list/ndarray
              values are flattened and stacked in numeric key order (other
              value types are skipped);
            * dict without digit-like keys: all values form a single step;
            * list / ndarray: 2-D input is used as is, anything else is
              flattened into a single step;
            * any other object: wrapped as a single step, or replaced by
              ``[[0.0]]`` when the conversion fails.
        """
        if isinstance(sample, dict):
            # Keep the original key next to its numeric value so that
            # zero-padded ("01") and non-string keys are looked up correctly.
            timed_keys = sorted(
                ((int(str(key)), key) for key in sample if str(key).isdigit()),
                key=lambda pair: pair[0],
            )
            if not timed_keys:
                # No time-step information: treat all values as one step.
                return np.array(list(sample.values())).flatten().reshape(1, -1)

            steps = [
                np.array(sample[key]).flatten()
                for _, key in timed_keys
                if isinstance(sample[key], (list, np.ndarray))
            ]
            if not steps:
                return np.array([[0.0]])

            # Zero-pad shorter steps so ragged steps still form a 2-D array.
            width = max(step.size for step in steps)
            sequence = np.zeros((len(steps), width))
            for row, step in zip(sequence, steps):
                row[:step.size] = step
            return sequence

        if isinstance(sample, (list, np.ndarray)):
            array = np.array(sample)
            if array.ndim == 2:
                return array
            # 1-D input is one time step; higher-rank input is flattened.
            return array.flatten().reshape(1, -1)

        try:
            return np.array([sample]).flatten().reshape(1, -1)
        except (ValueError, TypeError):
            return np.array([[0.0]])

    def _normalize_feature_dimensions(self, feature_vectors: List,
                                      max_timesteps: int = 50) -> Tuple[np.ndarray, List[int]]:
        """Bring heterogeneous samples to one 3-D array and standardise it.

        Steps: convert each sample to ``(n_timesteps, n_features)``, pad or
        truncate the feature axis to the most common width, pad or truncate
        the time axis to a common length, then fit ``self.scaler`` on the
        flattened observations and transform them.

        Args:
            feature_vectors: Samples as dicts, lists, ndarrays or scalars.
            max_timesteps: Cap on the common sequence length, applied only
                when the samples have differing lengths.

        Returns:
            ``(X, lengths)`` where ``X`` has shape
            ``(n_samples, n_timesteps, n_features)`` and ``lengths`` holds
            the per-sample sequence length after padding/truncation. Empty
            input returns ``(np.array([]), [])``.

        Note:
            ``self.scaler`` is re-fitted on every call. Standardisation is
            skipped when there are no observations or every feature has
            (near) zero standard deviation.
        """
        if feature_vectors is None or len(feature_vectors) == 0:
            return np.array([]), []
        max_timesteps = self._validate_positive_integer(max_timesteps, "max_timesteps")

        # Step 1: uniform 2-D representation per sample.
        sequences = [self._as_feature_sequence(sample) for sample in feature_vectors]
        lengths = [sequence.shape[0] for sequence in sequences]

        # Step 2: common feature width (most frequent; ties go to the larger
        # width so that no data is truncated).
        feature_dims = [sequence.shape[1] for sequence in sequences]
        distinct_dims = set(feature_dims)
        if len(distinct_dims) > 1:
            target_dim = max(distinct_dims, key=lambda dim: (feature_dims.count(dim), dim))
            print(f"🔧 特征维度分布: {set(feature_dims)}, 目标维度: {target_dim}")

            unified = []
            for sequence in sequences:
                width = sequence.shape[1]
                if width < target_dim:
                    padding = np.zeros((sequence.shape[0], target_dim - width))
                    sequence = np.concatenate([sequence, padding], axis=1)
                elif width > target_dim:
                    sequence = sequence[:, :target_dim]
                unified.append(sequence)
            sequences = unified

        # Step 3: common number of time steps.
        if max(lengths) != min(lengths):
            target_length = min(max(lengths), max_timesteps)

            resized = []
            for sequence in sequences:
                n_steps = sequence.shape[0]
                if n_steps == 0:
                    # Nothing to repeat: fill with zeros.
                    sequence = np.zeros((target_length, sequence.shape[1]))
                elif n_steps < target_length:
                    # Pad by repeating the last observed time step.
                    tail = sequence[-1:].repeat(target_length - n_steps, axis=0)
                    sequence = np.concatenate([sequence, tail], axis=0)
                elif n_steps > target_length:
                    sequence = sequence[:target_length]
                resized.append(sequence)
            sequences = resized
            lengths = [target_length] * len(sequences)

        # Step 4: stack and standardise.
        dims = [sequence.shape[1] for sequence in sequences]
        print(f"特征维度分布: {dims}, 平均维度: {np.mean(dims):.1f}")

        X = np.array(sequences)  # (n_samples, n_timesteps, n_features)
        if X.size == 0:
            # reshape(-1, 0) is not allowed; there is nothing to scale anyway.
            return X, lengths

        X_flat = X.reshape(-1, X.shape[-1])
        if np.any(np.std(X_flat, axis=0) > 1e-8):
            normalized_X = self.scaler.fit_transform(X_flat).reshape(X.shape)
        else:
            # Every feature is constant: leave the data untouched.
            normalized_X = X
        return normalized_X, lengths

    def fit(self, features_list: List[np.ndarray], labels: List[str]) -> Dict[str, Any]:
        """Train one HMM per distinct label.

        Args:
            features_list: One feature array per sample. A 2-D array is
                taken as ``(n_observations, n_features)``; any other rank is
                treated as a single observation.
            labels: Intent label of each sample, aligned with
                ``features_list``.

        Returns:
            Dict with ``train_accuracy`` (placeholder, always 0.0),
            ``n_classes``, ``classes`` and ``n_samples``.

        Raises:
            ValueError: If ``features_list`` and ``labels`` differ in length.
        """
        if len(features_list) != len(labels):
            raise ValueError(
                f"features_list 与 labels 的长度不一致: {len(features_list)} != {len(labels)}"
            )

        print("🚀 开始训练修复版DAG-HMM分类器...")
        print(f"样本数量: {len(features_list)}")
        print(f"类别数量: {len(set(labels))}")

        # Encode the labels.
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(labels)
        self.class_names = list(self.label_encoder.classes_)

        print("📋 类别名称:", self.class_names)

        # Group the samples by class.
        features_by_class = {}
        for class_name in self.class_names:
            class_features = [
                features for features, label in zip(features_list, labels)
                if label == class_name
            ]
            features_by_class[class_name] = class_features
            print(f"📈 类别 '{class_name}' : {len(class_features)} 个样本")

        # Train an independent HMM for every intent.
        self.intent_models = {}
        for class_name, class_features in features_by_class.items():
            print(f"🎯 训练意图 '{class_name}' 的HMM模型...")

            sequences = []
            for features in class_features:
                sequence = np.asarray(features, dtype=float)
                if sequence.ndim != 2:
                    # Same convention as predict(): one observation row.
                    sequence = sequence.reshape(1, -1)
                if not np.all(np.isfinite(sequence)):
                    print("⚠️ 发现异常特征值,正在清理...")
                    sequence = np.nan_to_num(sequence, nan=0.0, posinf=1e6, neginf=-1e6)
                # Keep the values in a numerically safe range.
                sequences.append(np.clip(sequence, -1e6, 1e6))

            # hmmlearn format: all observations stacked, plus per-sequence
            # lengths that sum to the number of stacked rows.
            X_class = np.vstack(sequences)
            lengths_class = [len(sequence) for sequence in sequences]

            model = self._create_robust_hmm_model(self.max_states, self.max_gaussians, self.random_state)
            model.fit(X_class, lengths_class)

            # Patch non-finite covariances left behind by training.
            if hasattr(model, 'covars_'):
                for state, covar in enumerate(model.covars_):
                    if np.all(np.isfinite(covar)):
                        continue
                    print(f"❌ 意图 '{class_name}' 状态 {state} 协方差包含异常值")
                    if self.covariance_type == "diag":
                        covar[~np.isfinite(covar)] = 1e-3
                        covar[covar <= 0] = 1e-3
                        model.covars_[state] = covar

            model = self._fix_transition_matrix(model, model_name=f"训练后的 {class_name} 模型")
            model = self._fix_startprob(model, model_name=f"训练后的 {class_name} 模型")
            self.intent_models[class_name] = model
            print(f"✅ 意图 '{class_name}' HMM模型训练完成。")

        self.is_trained = True
        print("🎉 训练完成!")
        return {
            "train_accuracy": 0.0,
            "n_classes": len(self.class_names),
            "classes": self.class_names,
            "n_samples": len(features_list),
        }

    def predict(self, features: np.ndarray, species) -> Dict[str, Any]:
        """Predict the intent of one feature sequence.

        Only models whose intent name contains ``species`` as a substring
        take part in the prediction.

        Args:
            features: Feature array. A 2-D array is scored as one sequence
                of ``n_observations`` frames; any other rank is flattened
                into a single observation.
            species: Substring selecting the candidate intent models.

        Returns:
            Dict with keys ``winner``, ``confidence`` and ``probabilities``:
            * no model matches ``species``: winner ``""``, confidence 0;
            * no model could score the input: winner ``"unknown"``;
            * best log-likelihood <= 0: winner ``""``, confidence is that
              log-likelihood and ``probabilities`` holds the raw
              log-likelihoods;
            * otherwise: the best intent, with the softmax probabilities.
              ``confidence`` is the best raw log-likelihood, not the
              softmax probability.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        if not self.intent_models:
            raise ValueError("模型未训练,请先调用fit方法")

        intent_models = {
            intent: model for intent, model in self.intent_models.items() if species in intent
        }
        if not intent_models:
            return {
                "winner": "",
                "confidence": 0,
                "probabilities": {}
            }

        # Accept lists as well as arrays.
        features = np.asarray(features, dtype=float)
        if features.ndim == 1:
            features_2d = features.reshape(1, -1)  # a single observation
            print(f"🔧 特征维度调整: {features.shape} -> {features_2d.shape}")
        elif features.ndim == 2:
            features_2d = features
        else:
            # Higher-rank input is flattened into a single observation.
            features_2d = features.flatten().reshape(1, -1)
            print(f"🔧 高维特征展平: {features.shape} -> {features_2d.shape}")

        if not np.all(np.isfinite(features_2d)):
            print("⚠️ 输入特征包含NaN或Inf值")
            features_2d = np.nan_to_num(features_2d, nan=0.0, posinf=1e6, neginf=-1e6)
            print("🔧 异常值已清理")

        feature_max = np.max(np.abs(features_2d))
        if feature_max > 1e6:
            print(f"⚠️ 特征值过大: {feature_max}")
            features_2d = np.clip(features_2d, -1e6, 1e6)
            print("🔧 特征值已裁剪到合理范围")

        print(
            f"🔍 输入特征统计: shape={features_2d.shape}, "
            f"mean={np.mean(features_2d):.3f}, std={np.std(features_2d):.3f}, "
            f"range=[{np.min(features_2d):.3f}, {np.max(features_2d):.3f}]"
        )

        # The whole input is one sequence, so its length is the row count.
        sequence_lengths = [features_2d.shape[0]]

        scores = {}
        for class_name, model in intent_models.items():
            # Diagnostic output: determinant of each state's covariance.
            print(f"🔍 {class_name} 模型协方差矩阵行列式:")
            if hasattr(model, 'covars_'):
                for i, covar in enumerate(model.covars_):
                    if self.covariance_type == "diag":
                        # Determinant of a diagonal matrix: product of its entries.
                        det = np.prod(covar)
                    else:
                        det = np.linalg.det(covar)
                    print(f" 状态 {i}: det = {det}")
                    if det <= 0:
                        print(f" ⚠️ 状态 {i} 协方差矩阵奇异!")
            try:
                score = model.score(features_2d, sequence_lengths)
                # A NaN/inf score would poison the softmax below.
                scores[class_name] = score if np.isfinite(score) else -np.inf
            except Exception as e:
                print(f"❌ 计算意图 '{class_name}' 对数似然失败: {e}")
                scores[class_name] = -np.inf  # this model cannot score the input

        class_names_ordered = list(scores)
        log_scores = np.array([scores[name] for name in class_names_ordered])

        if len(log_scores) == 0 or np.all(np.isneginf(log_scores)):
            return {"winner": "unknown", "confidence": 0.0, "probabilities": {}}

        max_log_score = np.max(log_scores)
        # NOTE(review): a best log-likelihood <= 0 is treated as "no intent
        # recognised" - presumably a rejection threshold; confirm with callers.
        if max_log_score <= 0:
            return {
                "winner": "",
                "confidence": max_log_score,
                "probabilities": dict(zip(class_names_ordered, log_scores.tolist()))
            }

        # Softmax over the log-likelihoods; subtracting the maximum avoids
        # overflow in exp().
        exp_scores = np.exp(log_scores - max_log_score)
        probabilities = exp_scores / np.sum(exp_scores)

        winner_class = class_names_ordered[int(np.argmax(probabilities))]

        return {
            "winner": winner_class,
            "confidence": max_log_score,
            "probabilities": dict(zip(class_names_ordered, probabilities.tolist()))
        }

    def evaluate(self, features_list: List[np.ndarray], labels: List[str],
                 species: str = "") -> Dict[str, float]:
        """Evaluate the classifier on labelled samples.

        Args:
            features_list: One feature array per sample.
            labels: True intent label of each sample.
            species: Forwarded to ``predict``. The default empty string is a
                substring of every intent name, so all models take part.

        Returns:
            Dict with ``accuracy`` and weighted ``precision``, ``recall``
            and ``f1``.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        if not self.intent_models:
            raise ValueError("模型未训练,请先调用fit方法")

        print("📊 评估模型性能...")

        # predict() requires the species argument.
        predictions = [
            self.predict(features, species)["winner"] for features in features_list
        ]

        from sklearn.metrics import accuracy_score, precision_recall_fscore_support
        accuracy = accuracy_score(labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average="weighted", zero_division=0
        )

        metrics = {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1
        }

        print(f"✅ 评估完成,准确率: {metrics['accuracy']:.4f}")
        return metrics

    def save_model(self, model_dir: str, model_name: str = "enhanced_dag_hmm_v2_classifier") -> Dict[str, str]:
        """Persist all model components below ``model_dir``.

        Writes one pickle per intent model, a pickle of the label encoder,
        a JSON file with the class names and a pickle of the scaler. The
        directory is created when missing.

        Args:
            model_dir: Target directory.
            model_name: Prefix of every written file name.

        Returns:
            Dict with keys ``intent_models`` (intent name -> path),
            ``label_encoder``, ``class_names`` and ``scaler``.
        """
        os.makedirs(model_dir, exist_ok=True)

        # One pickle per intent model.
        model_paths = {}
        for class_name, model in self.intent_models.items():
            model_path = os.path.join(model_dir, f"{model_name}_{class_name}.pkl")
            with open(model_path, "wb") as f:
                pickle.dump(model, f)
            model_paths[class_name] = model_path

        # Label encoder and class names.
        label_encoder_path = os.path.join(model_dir, f"{model_name}_label_encoder.pkl")
        with open(label_encoder_path, "wb") as f:
            pickle.dump(self.label_encoder, f)

        class_names_path = os.path.join(model_dir, f"{model_name}_class_names.json")
        with open(class_names_path, "w") as f:
            json.dump(self.class_names, f)

        # Scaler.
        scaler_path = os.path.join(model_dir, f"{model_name}_scaler.pkl")
        with open(scaler_path, "wb") as f:
            pickle.dump(self.scaler, f)

        print(f"💾 模型已保存到: {model_dir}")
        return {"intent_models": model_paths, "label_encoder": label_encoder_path, "class_names": class_names_path, "scaler": scaler_path}

    def load_model(self, model_dir: str, model_name: str = "enhanced_dag_hmm_v2_classifier") -> None:
        """Load the components written by ``save_model``.

        Intent models whose file is missing are skipped with a warning.
        Every loaded model has its transition matrix and start
        probabilities repaired.

        Args:
            model_dir: Directory holding the saved files.
            model_name: Prefix used when the files were saved.

        Raises:
            FileNotFoundError: If the label-encoder or scaler file is missing.
        """
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only load model directories that come from a trusted source.
        label_encoder_path = os.path.join(model_dir, f"{model_name}_label_encoder.pkl")
        if not os.path.exists(label_encoder_path):
            raise FileNotFoundError(f"Label encoder文件不存在: {label_encoder_path}")
        with open(label_encoder_path, "rb") as f:
            self.label_encoder = pickle.load(f)
        self.class_names = list(self.label_encoder.classes_)

        scaler_path = os.path.join(model_dir, f"{model_name}_scaler.pkl")
        if not os.path.exists(scaler_path):
            raise FileNotFoundError(f"Scaler文件不存在: {scaler_path}")
        with open(scaler_path, "rb") as f:
            self.scaler = pickle.load(f)

        # Load the HMM of every known intent.
        self.intent_models = {}
        for class_name in self.class_names:
            model_path = os.path.join(model_dir, f"{model_name}_{class_name}.pkl")
            if not os.path.exists(model_path):
                print(f"⚠️ 意图 '{class_name}' 的模型文件不存在: {model_path},跳过加载。")
                continue
            with open(model_path, "rb") as f:
                model = pickle.load(f)
            # Repair the loaded model's transition matrix and start probabilities.
            model = self._fix_transition_matrix(model, model_name=f"加载的 {class_name} 模型")
            model = self._fix_startprob(model, model_name=f"加载的 {class_name} 模型")
            self.intent_models[class_name] = model

        self.is_trained = True
        print(f"📂 模型已从 {model_dir} 加载")
|
||||
Reference in New Issue
Block a user