feat: first commit

2025-10-08 20:39:09 +08:00
commit 80f0e7f8d7
82 changed files with 12216 additions and 0 deletions
--- a/optimized_main.py
+++ b/optimized_main.py
@@ -0,0 +1,459 @@
+"""
+主程序 - 优化后的猫咪翻译器V2系统入口
+"""
+
+import os
+import sys
+import argparse
+import numpy as np
+import librosa
+import sounddevice as sd
+import time
+import json
+from typing import Dict, Any, List, Optional, Tuple
+
+from src.audio_input import AudioInput
+from src.hybrid_feature_extractor import HybridFeatureExtractor
+from src.dag_hmm_classifier_v2 import DAGHMMClassifierV2
+from src.cat_sound_detector import CatSoundDetector
+from src.sample_collector import SampleCollector
+from src.statistical_silence_detector import StatisticalSilenceDetector
+
+class OptimizedCatTranslator:
+    """
+    优化后的猫咪翻译器
+    
+    集成了时序调制特征、统计静音检测、混合特征提取、
+    调整梅尔滤波器数量以及DAG-HMM与优化特征结合的系统。
+    """
+    
+    def __init__(self, 
+                 detector_model_path: Optional[str] = "./models/cat_detector_svm.pkl",
+                 intent_model_path: Optional[str] = "./models",
+                 feature_type: str = "hybrid",
+                 detector_threshold: float = 0.5):
+        """
+        初始化优化后的猫咪翻译器
+
+        参数:
+            detector_model_path: 猫叫声检测器模型路径
+            intent_model_path: 意图分类器模型路径
+            feature_type: 特征类型，可选"temporal_modulation", "mfcc", "yamnet", "hybrid"
+            detector_threshold: 叫声检测阈值
+        """
+        self.audio_input = AudioInput()
+        self.feature_extractor = HybridFeatureExtractor()
+        self.detector_threshold = detector_threshold
+        self.feature_type = feature_type
+        self.species_labels = {
+            0: "none",
+            1: "cat",
+            2: "dog",
+            3: "pig",
+        }
+
+        # 加载猫叫声检测器
+        if detector_model_path and os.path.exists(detector_model_path):
+            self.cat_detector = CatSoundDetector()
+            self.cat_detector.load_model(detector_model_path)
+            print(f"猫叫声检测器已从 {detector_model_path} 加载")
+        else:
+            self.cat_detector = None
+            print("未加载猫叫声检测器，将使用YAMNet进行检测")
+
+        # 加载意图分类器
+        if intent_model_path and os.path.exists(intent_model_path):
+            self.intent_classifier = DAGHMMClassifierV2(feature_type=feature_type)
+            self.intent_classifier.load_model(intent_model_path)
+            print(f"意图分类器已从 {intent_model_path} 加载")
+        else:
+            self.intent_classifier = None
+            print("未加载意图分类器，将只进行猫叫声检测")
+
+    def analyze_file(self, file_path: str) -> Dict[str, Any]:
+        """
+        分析音频文件
+
+        参数:
+            file_path: 音频文件路径
+
+        返回:
+            result: 分析结果
+        """
+        print(f"分析音频文件: {file_path}")
+
+        # 加载音频
+        audio, sr = self.audio_input.load_from_file(file_path)
+
+        # 分析音频
+        return self.analyze_audio(audio, sr)
+
+    def analyze_audio(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, Any]:
+        """
+        分析音频数据
+
+        参数:
+            audio: 音频数据
+            sr: 采样率
+
+        返回:
+            result: 分析结果
+        """
+        # 1. 提取混合特征
+        # hybrid_features = self.feature_extractor.extract_hybrid_features(audio)
+
+        # 2. 检测物种叫声
+        if self.cat_detector:
+            # 使用优化后的物种叫声检测器
+            detector_result = self.cat_detector.predict(audio)
+            confidence = detector_result["prob"]
+            is_species_sound = detector_result["pred"] != 0 and confidence  > self.detector_threshold
+        else:
+            # 使用YAMNet检测
+            raise ValueError("未初始化物种叫声检测器")
+        species_labels = self.species_labels[detector_result["pred"]]
+
+        # 3. 如果是猫叫声，进行意图分类
+        intent_result = None
+        if is_species_sound and self.intent_classifier:
+            intent_result = self.intent_classifier.predict(audio, species_labels)
+
+        # 4. 构建结果
+        result = {
+            "species_labels": species_labels,
+            "is_species_sound": bool(is_species_sound),
+            "confidence": float(confidence),
+            "intent_result": intent_result
+        }
+
+        return result
+
+    def start_live_analysis(self,
+                           duration: float = 3.0,
+                           interval: float = 1.0,
+                           device: Optional[int] = None):
+        """
+        开始实时分析
+
+        参数:
+            duration: 每次录音持续时间（秒）
+            interval: 分析间隔时间（秒）
+            device: 录音设备ID
+        """
+        print(f"开始实时分析，按Ctrl+C停止...")
+        print(f"录音持续时间: {duration}秒，分析间隔: {interval}秒")
+
+        try:
+            while True:
+                # 录音
+                print("\n录音中...")
+                audio = self.audio_input.record_audio(duration=duration, device=device)
+
+                # 分析
+                result = self.analyze_audio(audio)
+
+                # 输出结果
+                if result["is_cat_sound"]:
+                    print(f"检测到猫叫声! 置信度: {result['confidence']:.4f}")
+                    if result["intent_result"]:
+                        intent = result["intent_result"]
+                        print(f"意图: {intent['class_name']} (置信度: {intent['confidence']:.4f})")
+                        print("所有类别概率:")
+                        for cls, prob in intent["probabilities"].items():
+                            print(f"  {cls}: {prob:.4f}")
+                else:
+                    print(f"未检测到猫叫声。置信度: {result['confidence']:.4f}")
+
+                # 等待
+                time.sleep(interval)
+
+        except KeyboardInterrupt:
+            print("\n实时分析已停止")
+
+    def add_sample(self,
+                  file_path: str,
+                  label: str,
+                  is_cat_sound: bool = True,
+                  cat_name: Optional[str] = None) -> Dict[str, Any]:
+        """
+        添加训练样本
+
+        参数:
+            file_path: 音频文件路径
+            label: 标签
+            is_cat_sound: 是否为猫叫声
+            cat_name: 猫咪名称
+
+        返回:
+            result: 添加结果
+        """
+        print(f"添加样本: {file_path}, 标签: {label}, 是否猫叫声: {is_cat_sound}")
+
+        # 加载音频
+        audio, sr = self.audio_input.load_from_file(file_path)
+
+        # 提取特征
+        hybrid_features = self.feature_extractor.extract_hybrid_features(audio)
+
+        # 保存样本
+        samples_dir = os.path.join("samples", cat_name if cat_name else "default")
+        os.makedirs(samples_dir, exist_ok=True)
+
+        # 生成样本ID
+        sample_id = int(time.time())
+
+        # 保存特征和元数据
+        sample_data = {
+            "features": hybrid_features.tolist(),
+            "label": label,
+            "is_cat_sound": is_cat_sound,
+            "cat_name": cat_name,
+            "file_path": file_path,
+            "timestamp": sample_id
+        }
+
+        sample_path = os.path.join(samples_dir, f"sample_{sample_id}.json")
+        with open(sample_path, "w") as f:
+            json.dump(sample_data, f)
+
+        print(f"样本已保存到 {sample_path}")
+
+        return {
+            "sample_id": sample_id,
+            "sample_path": sample_path
+        }
+
+    def train_detector(self,
+                      model_type: str = "svm",
+                      output_path: Optional[str] = None) -> Dict[str, Any]:
+        """
+        训练猫叫声检测器
+
+        参数:
+            model_type: 模型类型，可选"svm", "rf", "nn"
+            output_path: 输出路径
+
+        返回:
+            metrics: 训练指标
+        """
+        print(f"训练物种叫声检测器，模型类型: {model_type}")
+
+        species_sounds_audio = {
+            "cat_sounds": [],
+            "dog_sounds": [],
+            "non_sounds": [],
+        }
+        collector = SampleCollector()
+
+        for species_sounds in species_sounds_audio:
+            [
+                species_sounds_audio[species_sounds].append(librosa.load(file_path, sr=16000)[0]) for file_path in
+                [meta["target_path"] for _, meta in collector.metadata[species_sounds].items()]
+            ]
+
+
+        # 获取样本数量
+        sample_counts = collector.get_sample_counts()
+        print(f"猫叫声样本数量: {sample_counts['cat_sounds']}")
+        print(f"狗叫声样本数量: {sample_counts['dog_sounds']}")
+        print(f"非物种叫声样本数量: {sample_counts['non_sounds']}")
+
+        # 初始化检测器
+        detector = CatSoundDetector(model_type=model_type)
+
+        # 准备训练数据
+
+        # 训练模型
+        metrics = detector.train(species_sounds_audio, validation_split=0.2)
+
+        # 输出评估指标
+        print("\n评估指标:")
+        print(f"训练集准确率: {metrics['train_accuracy']:.4f}")
+        # print(f"训练集精确率: {metrics['train_precision']:.4f}")
+        # print(f"训练集召回率: {metrics['train_recall']:.4f}")
+        # print(f"训练集F1得分: {metrics['train_f1']:.4f}")
+        print(f"测试集准确率: {metrics['val_accuracy']:.4f}")
+        print(f"测试集精确率: {metrics['val_precision']:.4f}")
+        print(f"测试集召回率: {metrics['val_recall']:.4f}")
+        print(f"测试集F1得分: {metrics['val_f1']:.4f}")
+
+        # 保存模型
+        model_path = os.path.join(output_path, f"cat_detector_{model_type}.pkl")
+        detector.save_model(model_path)
+        print(f"模型已保存到: {model_path}")
+
+        return metrics
+
+    def train_intent_classifier(self,
+                               samples_dir: str,
+                               feature_type: str = "hybrid",
+                               output_path: Optional[str] = None) -> Dict[str, Any]:
+        """
+        训练意图分类器
+
+        参数:
+            samples_dir: 样本目录
+            feature_type: 特征类型，可选"temporal_modulation", "mfcc", "yamnet", "hybrid"
+            output_path: 输出路径
+
+        返回:
+            metrics: 训练指标
+        """
+        print(f"训练意图分类器，特征类型: {feature_type}")
+
+        # 加载样本
+        audio_files = []
+        labels = []
+        # 遍历样本目录下的所有子目录（每个子目录对应一个意图类别）
+        for intent_dir in os.listdir(samples_dir):
+            intent_path = os.path.join(samples_dir, intent_dir)
+            if os.path.isdir(intent_path):
+                for file in os.listdir(intent_path):
+                    if file.endswith(".wav") or file.endswith(".WAV") or file.endswith(".mp3"):
+                        audio_path = os.path.join(intent_path, file)
+                        audio, sr = librosa.load(audio_path, sr=16000)
+                        if audio.size > 0: # 确保音频数据不为空
+                            audio_files.append(audio)
+                            labels.append(intent_dir)
+                        else:
+                            print(f"警告: 音频文件 {audio_path} 为空，跳过。")
+
+        print(f"加载了 {len(audio_files)} 个样本，共 {len(set(labels))} 个意图类别")
+
+        if not audio_files or len(set(labels)) < 2: # 至少需要两个类别才能训练分类器
+            print("错误: 训练意图分类器所需样本或类别不足，跳过训练。")
+            return {"train_accuracy": float("nan"), "message": "样本或类别不足"}
+
+        # 初始化分类器
+        classifier = DAGHMMClassifierV2(feature_type=feature_type)
+
+        # 训练模型
+        metrics = classifier.fit(audio_files, labels)
+
+        # 保存模型
+        if output_path:
+            classifier.save_model(output_path)
+            print(f"模型已保存到 {output_path}")
+
+        return metrics
+
+
+def main():
+    parser = argparse.ArgumentParser(description="优化后的猫咪翻译器V2")
+
+    # 子命令
+    subparsers = parser.add_subparsers(dest="command", help="命令")
+
+    # 分析命令
+    analyze_parser = subparsers.add_parser("analyze", help="分析音频文件")
+    analyze_parser.add_argument("file", help="音频文件路径")
+    analyze_parser.add_argument("--detector", help="猫叫声检测器模型路径", default="./models/cat_detector_svm.pkl")
+    analyze_parser.add_argument("--intent-model", help="意图分类器模型路径", default="./models")
+    analyze_parser.add_argument("--feature-type", default="hybrid",
+                               choices=["temporal_modulation", "mfcc", "yamnet", "hybrid"],
+                               help="特征类型")
+    analyze_parser.add_argument("--threshold", type=float, default=0.5, help="猫叫声检测阈值")
+
+    # 实时分析命令
+    live_parser = subparsers.add_parser("live", help="实时分析麦克风输入")
+    live_parser.add_argument("--detector", help="猫叫声检测器模型路径", default="./models/cat_detector_svm.pkl")
+    live_parser.add_argument("--intent-model", help="意图分类器模型路径", default="./models")
+    live_parser.add_argument("--feature-type", default="temporal_modulation",
+                            choices=["temporal_modulation", "mfcc", "yamnet", "hybrid"],
+                            help="特征类型")
+    live_parser.add_argument("--threshold", type=float, default=0.5, help="猫叫声检测阈值")
+    live_parser.add_argument("--duration", type=float, default=3.0, help="每次录音持续时间（秒）")
+    live_parser.add_argument("--interval", type=float, default=1.0, help="分析间隔时间（秒）")
+    live_parser.add_argument("--device", type=int, help="录音设备ID")
+
+    # 添加样本命令
+    add_sample_parser = subparsers.add_parser("add-sample", help="添加训练样本")
+    add_sample_parser.add_argument("file", help="音频文件路径")
+    add_sample_parser.add_argument("label", help="标签")
+    add_sample_parser.add_argument("--is-cat-sound", action="store_true", help="是否为猫叫声")
+    add_sample_parser.add_argument("--cat", help="猫咪名称")
+
+    # 训练检测器命令
+    train_detector_parser = subparsers.add_parser("train-detector", help="训练猫叫声检测器")
+    train_detector_parser.add_argument("--model-type", default="svm", choices=["svm", "rf", "nn"], help="模型类型")
+    train_detector_parser.add_argument("--output", default="./models", help="输出路径")
+
+    # 训练意图分类器命令
+    train_intent_parser = subparsers.add_parser("train-intent", help="训练意图分类器")
+    train_intent_parser.add_argument("--samples", required=True, help="样本目录")
+    train_intent_parser.add_argument("--feature-type", default="hybrid",
+                                    choices=["temporal_modulation", "mfcc", "yamnet", "hybrid"],
+                                    help="特征类型")
+    train_intent_parser.add_argument("--output", help="输出路径")
+
+    args = parser.parse_args()
+
+    if args.command == "analyze":
+        translator = OptimizedCatTranslator(
+            detector_model_path=args.detector,
+            intent_model_path=args.intent_model,
+            feature_type=args.feature_type,
+            detector_threshold=args.threshold
+        )
+        result = translator.analyze_file(args.file)
+
+        # 输出结果
+        if result["is_species_sound"]:
+            print(f"检测到 {result['species_labels']} 叫声! 置信度: {result['confidence']:.4f}")
+            if result["intent_result"]:
+                intent = result["intent_result"]
+                if intent['winner']:
+                    print(f"意图: {intent['winner']} (置信度: {intent['confidence']:.4f})")
+                else:
+                    print("⚠️特征学习中。。。")
+                    print(intent)
+
+        else:
+            print(f"未检测到物种叫声。置信度: {result['confidence']:.4f}")
+
+    elif args.command == "live":
+        translator = OptimizedCatTranslator(
+            detector_model_path=args.detector,
+            intent_model_path=args.intent_model,
+            feature_type=args.feature_type,
+            detector_threshold=args.threshold
+        )
+        translator.start_live_analysis(
+            duration=args.duration,
+            interval=args.interval,
+            device=args.device
+        )
+
+    elif args.command == "add-sample":
+        translator = OptimizedCatTranslator()
+        result = translator.add_sample(
+            file_path=args.file,
+            label=args.label,
+            is_cat_sound=args.is_cat_sound,
+            cat_name=args.cat
+        )
+        print(f"样本已添加，ID: {result['sample_id']}")
+
+    elif args.command == "train-detector":
+        translator = OptimizedCatTranslator()
+        metrics = translator.train_detector(
+            model_type=args.model_type,
+            output_path=args.output
+        )
+        print(f"训练完成")
+
+    elif args.command == "train-intent":
+        translator = OptimizedCatTranslator()
+        metrics = translator.train_intent_classifier(
+            samples_dir=args.samples,
+            feature_type=args.feature_type,
+            output_path=args.output
+        )
+        print(f"训练完成")
+
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()