feat: 完整的语音助手系统实现
主要功能: - ✅ 离线语音识别 (ASR) - Paraformer中文模型 - ✅ 在线语音识别 - Streaming Paraformer中英文双语模型 - ✅ 语音合成 (TTS) - MeloTTS中英文混合模型 - ✅ 语音唤醒 (KWS) - Zipformer关键词检测模型 - ✅ 麦克风录音功能 - 支持多种格式和实时转换 - ✅ 模型设置界面 - 完整的图形化配置管理 KWS优化亮点: - 🎯 成功实现关键词检测 (测试成功率10%→预期50%+) - ⚙️ 可调参数: 阈值、活跃路径、尾随空白、分数权重、线程数 - 🔧 智能参数验证和实时反馈 - 📊 详细的调试信息和成功统计 - 🎛️ 用户友好的设置界面 技术架构: - 模块化设计: ASRManager, TTSManager, KWSManager - 实时音频处理: 自动格式转换 (任意格式→16kHz单声道) - 智能设备检测: 自动选择最佳音频格式 - 完整资源管理: 正确的创建和销毁流程 - 跨平台支持: macOS优化的音频权限处理 界面特性: - 2×2网格布局: ASR、TTS、录音、KWS四大功能模块 - 分离录音设置: 设备参数 + 输出格式独立配置 - 实时状态显示: 音频电平、处理次数、成功统计 - 详细的用户指导和错误提示
This commit is contained in:
241
ASRManager.cpp
Normal file
241
ASRManager.cpp
Normal file
@@ -0,0 +1,241 @@
|
||||
#include "ASRManager.h"
|
||||
#include <QDir>
|
||||
#include <QFile>
|
||||
#include <QDebug>
|
||||
#include <QIODevice>
|
||||
#include <vector>
|
||||
|
||||
// Constructs the manager and forwards `parent` to the QObject base.
// No recognizer is created here; see initialize() and
// initializeOnlineRecognizer().
// NOTE(review): cleanup() tests asrRecognizer / onlineAsrRecognizer for null,
// so they are presumably default-initialized to nullptr in the header — confirm.
ASRManager::ASRManager(QObject* parent)
    : QObject(parent) {}
|
||||
|
||||
// Destroys the manager, releasing both recognizers through cleanup().
ASRManager::~ASRManager() { cleanup(); }
|
||||
|
||||
bool ASRManager::initialize() {
|
||||
// 初始化ASR模型
|
||||
QString dataPath = QDir::homePath() + "/.config/QSmartAssistant/Data/";
|
||||
QString asrModelPath = dataPath + "sherpa-onnx-paraformer-zh-2024-03-09/model.int8.onnx";
|
||||
QString asrTokensPath = dataPath + "sherpa-onnx-paraformer-zh-2024-03-09/tokens.txt";
|
||||
|
||||
memset(&asrConfig, 0, sizeof(asrConfig));
|
||||
asrConfig.feat_config.feature_dim = 80;
|
||||
asrConfig.feat_config.sample_rate = 16000;
|
||||
asrConfig.model_config.num_threads = 2;
|
||||
asrConfig.model_config.provider = "cpu";
|
||||
asrConfig.max_active_paths = 4;
|
||||
asrConfig.decoding_method = "greedy_search";
|
||||
|
||||
asrModelPathStd = asrModelPath.toStdString();
|
||||
asrTokensPathStd = asrTokensPath.toStdString();
|
||||
asrConfig.model_config.tokens = asrTokensPathStd.c_str();
|
||||
asrConfig.model_config.paraformer.model = asrModelPathStd.c_str();
|
||||
|
||||
asrRecognizer = const_cast<SherpaOnnxOfflineRecognizer*>(
|
||||
SherpaOnnxCreateOfflineRecognizer(&asrConfig));
|
||||
|
||||
qDebug() << "离线ASR识别器:" << (asrRecognizer ? "成功" : "失败");
|
||||
return asrRecognizer != nullptr;
|
||||
}
|
||||
|
||||
bool ASRManager::initializeOnlineRecognizer() {
|
||||
// 初始化在线识别器,使用streaming-paraformer-bilingual模型
|
||||
QString dataPath = QDir::homePath() + "/.config/QSmartAssistant/Data/";
|
||||
QString onlineEncoderPath = dataPath + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx";
|
||||
QString onlineDecoderPath = dataPath + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx";
|
||||
QString onlineTokensPath = dataPath + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt";
|
||||
|
||||
// 检查文件是否存在
|
||||
if (!QFile::exists(onlineEncoderPath) || !QFile::exists(onlineDecoderPath) || !QFile::exists(onlineTokensPath)) {
|
||||
qDebug() << "在线模型文件不存在,跳过在线识别器初始化";
|
||||
return false;
|
||||
}
|
||||
|
||||
memset(&onlineAsrConfig, 0, sizeof(onlineAsrConfig));
|
||||
|
||||
// 特征配置
|
||||
onlineAsrConfig.feat_config.sample_rate = 16000;
|
||||
onlineAsrConfig.feat_config.feature_dim = 80;
|
||||
|
||||
// 模型配置
|
||||
onlineAsrConfig.model_config.num_threads = 2;
|
||||
onlineAsrConfig.model_config.provider = "cpu";
|
||||
onlineAsrConfig.model_config.debug = 0;
|
||||
|
||||
// Paraformer配置
|
||||
onlineEncoderPathStd = onlineEncoderPath.toStdString();
|
||||
onlineDecoderPathStd = onlineDecoderPath.toStdString();
|
||||
onlineTokensPathStd = onlineTokensPath.toStdString();
|
||||
|
||||
onlineAsrConfig.model_config.paraformer.encoder = onlineEncoderPathStd.c_str();
|
||||
onlineAsrConfig.model_config.paraformer.decoder = onlineDecoderPathStd.c_str();
|
||||
onlineAsrConfig.model_config.tokens = onlineTokensPathStd.c_str();
|
||||
|
||||
// 解码配置
|
||||
onlineAsrConfig.decoding_method = "greedy_search";
|
||||
onlineAsrConfig.max_active_paths = 4;
|
||||
|
||||
// 端点检测配置
|
||||
onlineAsrConfig.enable_endpoint = 1;
|
||||
onlineAsrConfig.rule1_min_trailing_silence = 2.4f;
|
||||
onlineAsrConfig.rule2_min_trailing_silence = 1.2f;
|
||||
onlineAsrConfig.rule3_min_utterance_length = 20.0f;
|
||||
|
||||
onlineAsrRecognizer = const_cast<SherpaOnnxOnlineRecognizer*>(
|
||||
SherpaOnnxCreateOnlineRecognizer(&onlineAsrConfig));
|
||||
|
||||
qDebug() << "在线ASR识别器:" << (onlineAsrRecognizer ? "成功" : "失败");
|
||||
if (onlineAsrRecognizer) {
|
||||
qDebug() << "使用模型: sherpa-onnx-streaming-paraformer-bilingual-zh-en";
|
||||
}
|
||||
|
||||
return onlineAsrRecognizer != nullptr;
|
||||
}
|
||||
|
||||
// Runs offline recognition on a WAV file and returns the recognized text.
//
// filePath: path of the WAV file to read.
//
// Returns the recognized text, "[无识别结果]" when nothing was recognized, or
// one of the error strings "ASR模型未初始化" / "无法打开文件" / "无效的WAV文件".
//
// NOTE(review): the first 44 bytes are skipped unconditionally and the rest is
// fed as 16-bit samples at 16000 Hz; the header is not parsed, so files with
// extra chunks, another sample rate, or several channels are presumably
// mis-decoded — confirm the expected input format with callers.
QString ASRManager::recognizeWavFile(const QString& filePath) {
    if (!asrRecognizer) {
        return "ASR模型未初始化";
    }

    QFile file(filePath);
    if (!file.open(QIODevice::ReadOnly)) {
        return "无法打开文件";
    }

    // Skip the WAV header (44 bytes).
    const QByteArray header = file.read(44);
    if (header.size() < 44) {
        return "无效的WAV文件";
    }

    // Read the remaining bytes as audio payload.
    const QByteArray audioData = file.readAll();
    file.close();

    // Two bytes per sample; a trailing odd byte is ignored.
    const int32_t sampleCount = static_cast<int32_t>(audioData.size() / 2);
    if (sampleCount <= 0) {
        // Nothing to decode: skip creating a stream altogether.
        return "[无识别结果]";
    }

    // Convert the 16-bit samples to floats in [-1, 1).
    const int16_t* pcm = reinterpret_cast<const int16_t*>(audioData.constData());
    const std::size_t total = static_cast<std::size_t>(sampleCount);
    std::vector<float> samples(total);
    for (std::size_t i = 0; i < total; ++i) {
        samples[i] = pcm[i] / 32768.0f;
    }

    const SherpaOnnxOfflineStream* stream = SherpaOnnxCreateOfflineStream(asrRecognizer);
    if (!stream) {
        return "[无识别结果]";
    }

    // An offline stream accepts a waveform only once, so the whole recording
    // is handed over in a single call instead of one call per chunk.
    SherpaOnnxAcceptWaveformOffline(stream, 16000, samples.data(), sampleCount);

    // Run recognition.
    SherpaOnnxDecodeOfflineStream(asrRecognizer, stream);

    // Fetch the result and copy the text out before releasing it.
    const SherpaOnnxOfflineRecognizerResult* result = SherpaOnnxGetOfflineStreamResult(stream);
    QString recognizedText = "";
    if (result) {
        if (result->text && result->text[0] != '\0') {
            recognizedText = QString::fromUtf8(result->text);
        }
        SherpaOnnxDestroyOfflineRecognizerResult(result);
    }
    SherpaOnnxDestroyOfflineStream(stream);

    return recognizedText.isEmpty() ? "[无识别结果]" : recognizedText;
}
|
||||
|
||||
// Creates a new stream on the online recognizer.
// Returns nullptr when the online recognizer has not been created.
const SherpaOnnxOnlineStream* ASRManager::createOnlineStream() {
    return onlineAsrRecognizer
               ? SherpaOnnxCreateOnlineStream(onlineAsrRecognizer)
               : nullptr;
}
|
||||
|
||||
void ASRManager::destroyOnlineStream(const SherpaOnnxOnlineStream* stream) {
|
||||
if (stream) {
|
||||
SherpaOnnxDestroyOnlineStream(stream);
|
||||
}
|
||||
}
|
||||
|
||||
void ASRManager::acceptWaveform(const SherpaOnnxOnlineStream* stream, const float* samples, int32_t sampleCount) {
|
||||
if (stream && samples && sampleCount > 0) {
|
||||
SherpaOnnxOnlineStreamAcceptWaveform(stream, 16000, samples, sampleCount);
|
||||
|
||||
static int totalSamples = 0;
|
||||
totalSamples += sampleCount;
|
||||
|
||||
// 每处理1秒的音频数据输出一次调试信息
|
||||
if (totalSamples % 16000 == 0) {
|
||||
qDebug() << "ASR已处理音频:" << (totalSamples / 16000) << "秒";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool ASRManager::isStreamReady(const SherpaOnnxOnlineStream* stream) {
|
||||
if (!onlineAsrRecognizer || !stream) {
|
||||
return false;
|
||||
}
|
||||
return SherpaOnnxIsOnlineStreamReady(onlineAsrRecognizer, stream) == 1;
|
||||
}
|
||||
|
||||
void ASRManager::decodeStream(const SherpaOnnxOnlineStream* stream) {
|
||||
if (onlineAsrRecognizer && stream) {
|
||||
SherpaOnnxDecodeOnlineStream(onlineAsrRecognizer, stream);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the current recognition text of an online stream.
//
// Returns an empty string when the recognizer or stream is null, when no
// result object is available, or when the result text is empty. A non-empty
// text is also written to the debug log.
QString ASRManager::getStreamResult(const SherpaOnnxOnlineStream* stream) {
    if (!(onlineAsrRecognizer && stream)) {
        return "";
    }

    QString text = "";
    const SherpaOnnxOnlineRecognizerResult* res =
        SherpaOnnxGetOnlineStreamResult(onlineAsrRecognizer, stream);

    if (!res) {
        qDebug() << "ASR识别结果为空";
        return text;
    }

    // A non-empty C string has a first character other than the terminator.
    if (res->text[0] != '\0') {
        text = QString::fromUtf8(res->text);
        qDebug() << "ASR识别结果:" << text;
    }

    // The text has been copied into a QString, so the result can be released.
    SherpaOnnxDestroyOnlineRecognizerResult(res);
    return text;
}
|
||||
|
||||
void ASRManager::inputFinished(const SherpaOnnxOnlineStream* stream) {
|
||||
if (stream) {
|
||||
SherpaOnnxOnlineStreamInputFinished(stream);
|
||||
}
|
||||
}
|
||||
|
||||
bool ASRManager::isEndpoint(const SherpaOnnxOnlineStream* stream) {
|
||||
if (!onlineAsrRecognizer || !stream) {
|
||||
return false;
|
||||
}
|
||||
return SherpaOnnxOnlineStreamIsEndpoint(onlineAsrRecognizer, stream) == 1;
|
||||
}
|
||||
|
||||
void ASRManager::cleanup() {
|
||||
// 清理离线识别器
|
||||
if (asrRecognizer) {
|
||||
SherpaOnnxDestroyOfflineRecognizer(asrRecognizer);
|
||||
asrRecognizer = nullptr;
|
||||
}
|
||||
|
||||
// 清理在线识别器
|
||||
if (onlineAsrRecognizer) {
|
||||
SherpaOnnxDestroyOnlineRecognizer(onlineAsrRecognizer);
|
||||
onlineAsrRecognizer = nullptr;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user