QSmartAssistant/SpeechTestMainWindow.cpp
lizhuoran e92cb0b4e5 feat: complete voice assistant system implementation
Main features:
-  Offline speech recognition (ASR) - Paraformer Chinese model
-  Online speech recognition - Streaming Paraformer Chinese/English bilingual model
-  Speech synthesis (TTS) - MeloTTS mixed Chinese/English model
-  Voice wake-up (KWS) - Zipformer keyword spotting model
-  Microphone recording - multiple formats with real-time conversion
-  Model settings dialog - full graphical configuration management

KWS optimization highlights:
- 🎯 Working keyword detection (test success rate 10% → expected 50%+)
- ⚙️ Tunable parameters: threshold, max active paths, trailing blanks, keyword score boost, thread count
- 🔧 Smart parameter validation with real-time feedback
- 📊 Detailed debug output and success statistics
- 🎛️ User-friendly settings UI

Technical architecture:
- Modular design: ASRManager, TTSManager, KWSManager
- Real-time audio processing: automatic format conversion (any format → 16 kHz mono)
- Smart device detection: automatically picks the best supported audio format
- Complete resource management: proper creation and teardown paths
- Cross-platform support: macOS-tuned handling of audio permissions

UI features:
- 2×2 grid layout: four modules - ASR, TTS, recording, KWS
- Separate recording settings: device parameters and output format configured independently
- Live status display: audio level, processing counts, success statistics
- Detailed user guidance and error messages
2025-12-23 13:47:00 +08:00


#include "SpeechTestMainWindow.h"
#include <QApplication>
#include <QFileDialog>
#include <QMessageBox>
#include <QDebug>
#include <QDir>
#include <QDateTime>
#include <QProcess>
#include <QMediaDevices>
#include <QAudioFormat>
#include <QMenuBar>
#include <QAction>
#include <QCheckBox>
#include <QMenu>
#include <QGridLayout>
#include <QStatusBar>
#include <QTimer>
#include <QFile>
#include <QFileInfo>
#include <QDataStream>
#include <QTextStream>
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>
SpeechTestMainWindow::SpeechTestMainWindow(QWidget* parent) : QMainWindow(parent) {
// 创建管理器
asrManager = new ASRManager(this);
ttsManager = new TTSManager(this);
kwsManager = new KWSManager(this);
setupUI();
setupMenuBar();
createOutputDirectories();
connectSignals();
// 初始化模型
bool asrOk = asrManager->initialize();
bool ttsOk = ttsManager->initialize();
bool kwsOk = kwsManager->initialize();
// 尝试初始化在线识别器(当前会失败)
asrManager->initializeOnlineRecognizer();
setWindowTitle("QSmartAssistant 语音测试工具");
setMinimumSize(1200, 800); // 增加最小尺寸以适应网格布局
resize(1400, 900); // 增加默认尺寸
// 根据在线识别器状态更新麦克风按钮
if (asrManager->isOnlineInitialized()) {
micRecordBtn->setEnabled(true);
micRecordBtn->setText("开始麦克风识别");
micRecordBtn->setStyleSheet("QPushButton { background-color: #FF5722; color: white; font-weight: bold; }");
} else {
micRecordBtn->setEnabled(false);
micRecordBtn->setText("麦克风识别(模型未加载)");
micRecordBtn->setStyleSheet("QPushButton { background-color: #9E9E9E; color: white; font-weight: bold; }");
}
// 更新状态栏
if (asrOk && ttsOk && kwsOk) {
QString modelInfo = ttsManager->isMultilingualModel() ? "(支持中英文混合)" : "(仅支持中文)";
QString micInfo = asrManager->isOnlineInitialized() ? ",支持实时识别" : ",麦克风识别不可用";
QString kwsInfo = kwsOk ? ",支持语音唤醒" : ",语音唤醒不可用";
statusBar()->showMessage("模型初始化成功,就绪 " + modelInfo + micInfo + kwsInfo);
} else {
statusBar()->showMessage("模型初始化失败");
if (!asrOk) qDebug() << "离线ASR初始化失败";
if (!ttsOk) qDebug() << "TTS初始化失败";
if (!kwsOk) qDebug() << "KWS初始化失败";
}
}
SpeechTestMainWindow::~SpeechTestMainWindow() {
// 停止麦克风识别
if (isRecording) {
stopMicRecognition();
}
// 停止录音
if (isRecordingWav) {
stopRecording();
}
// 清理音频输入
if (audioSource) {
audioSource->stop();
delete audioSource;
audioSource = nullptr;
}
if (audioTimer) {
audioTimer->stop();
delete audioTimer;
audioTimer = nullptr;
}
// 清理录音资源
if (recordAudioSource) {
recordAudioSource->stop();
delete recordAudioSource;
recordAudioSource = nullptr;
}
if (recordTimer) {
recordTimer->stop();
delete recordTimer;
recordTimer = nullptr;
}
// 清理语音唤醒资源
if (kwsAudioSource) {
kwsAudioSource->stop();
delete kwsAudioSource;
kwsAudioSource = nullptr;
}
if (kwsTimer) {
kwsTimer->stop();
delete kwsTimer;
kwsTimer = nullptr;
}
}
void SpeechTestMainWindow::setupUI() {
auto* centralWidget = new QWidget(this);
setCentralWidget(centralWidget);
auto* mainLayout = new QVBoxLayout(centralWidget);
// 创建网格布局(两行两列)
auto* gridLayout = new QGridLayout();
gridLayout->setSpacing(10);
gridLayout->setContentsMargins(10, 10, 10, 10);
// 设置行列拉伸策略,让各模块均匀分配空间
gridLayout->setRowStretch(0, 1); // 第一行拉伸因子为1
gridLayout->setRowStretch(1, 1); // 第二行拉伸因子为1
gridLayout->setColumnStretch(0, 1); // 第一列拉伸因子为1
gridLayout->setColumnStretch(1, 1); // 第二列拉伸因子为1
// 创建一个容器widget来包含网格布局
auto* gridWidget = new QWidget(this);
gridWidget->setLayout(gridLayout);
mainLayout->addWidget(gridWidget);
// ASR部分
auto* asrGroup = new QGroupBox("语音识别 (ASR)", this);
auto* asrLayout = new QVBoxLayout(asrGroup);
// 文件选择
auto* fileLayout = new QHBoxLayout();
filePathEdit = new QLineEdit(this);
filePathEdit->setPlaceholderText("选择WAV音频文件...");
auto* browseBtn = new QPushButton("浏览", this);
browseBtn->setObjectName("browseBtn");
auto* recognizeBtn = new QPushButton("开始识别", this);
recognizeBtn->setObjectName("recognizeBtn");
recognizeBtn->setStyleSheet("QPushButton { background-color: #4CAF50; color: white; font-weight: bold; }");
fileLayout->addWidget(new QLabel("音频文件:", this));
fileLayout->addWidget(filePathEdit, 1);
fileLayout->addWidget(browseBtn);
fileLayout->addWidget(recognizeBtn);
asrLayout->addLayout(fileLayout);
// 麦克风识别控件
auto* micLayout = new QHBoxLayout();
micRecordBtn = new QPushButton("开始麦克风识别", this);
micRecordBtn->setStyleSheet("QPushButton { background-color: #FF5722; color: white; font-weight: bold; }");
micStopBtn = new QPushButton("停止识别", this);
micStopBtn->setStyleSheet("QPushButton { background-color: #9E9E9E; color: white; font-weight: bold; }");
micStopBtn->setEnabled(false);
micLayout->addWidget(new QLabel("实时识别:", this));
micLayout->addStretch();
micLayout->addWidget(micRecordBtn);
micLayout->addWidget(micStopBtn);
asrLayout->addLayout(micLayout);
// 识别结果
asrResultEdit = new QTextEdit(this);
asrResultEdit->setPlaceholderText("识别结果将显示在这里...");
asrResultEdit->setMinimumHeight(100);
asrResultEdit->setMaximumHeight(200);
asrLayout->addWidget(new QLabel("识别结果:", this));
asrLayout->addWidget(asrResultEdit);
// 将ASR组件添加到网格布局的第一行第一列
gridLayout->addWidget(asrGroup, 0, 0);
// TTS部分
auto* ttsGroup = new QGroupBox("文字转语音 (TTS)", this);
auto* ttsLayout = new QVBoxLayout(ttsGroup);
// 文本输入
ttsTextEdit = new QTextEdit(this);
ttsTextEdit->setPlaceholderText("请输入要合成的文本(支持中英文混合)...");
ttsTextEdit->setMinimumHeight(80);
ttsTextEdit->setMaximumHeight(120);
ttsLayout->addWidget(new QLabel("输入文本:", this));
ttsLayout->addWidget(ttsTextEdit);
// TTS设置
auto* ttsSettingsLayout = new QHBoxLayout();
speakerIdSpinBox = new QSpinBox(this);
speakerIdSpinBox->setRange(0, 100);
speakerIdSpinBox->setValue(0);
auto* synthesizeBtn = new QPushButton("开始合成", this);
synthesizeBtn->setObjectName("synthesizeBtn");
synthesizeBtn->setStyleSheet("QPushButton { background-color: #2196F3; color: white; font-weight: bold; }");
ttsSettingsLayout->addWidget(new QLabel("说话人ID:", this));
ttsSettingsLayout->addWidget(speakerIdSpinBox);
ttsSettingsLayout->addStretch();
ttsSettingsLayout->addWidget(synthesizeBtn);
ttsLayout->addLayout(ttsSettingsLayout);
// TTS结果
ttsResultEdit = new QTextEdit(this);
ttsResultEdit->setPlaceholderText("合成结果将显示在这里...");
ttsResultEdit->setMinimumHeight(80);
ttsResultEdit->setMaximumHeight(120);
ttsLayout->addWidget(new QLabel("合成结果:", this));
ttsLayout->addWidget(ttsResultEdit);
// 将TTS组件添加到网格布局的第一行第二列
gridLayout->addWidget(ttsGroup, 0, 1);
// 录音功能部分
auto* recordGroup = new QGroupBox("麦克风录音", this);
auto* recordLayout = new QVBoxLayout(recordGroup);
// 录音设置区域(设备参数)
auto* recordSettingsGroup = new QGroupBox("录音设置(设备参数)", this);
auto* recordSettingsLayout = new QHBoxLayout(recordSettingsGroup);
// 录音采样率设置
recordSampleRateComboBox = new QComboBox(this);
recordSampleRateComboBox->addItem("自动检测最佳", -1);
recordSampleRateComboBox->addItem("48000 Hz (专业)", 48000);
recordSampleRateComboBox->addItem("44100 Hz (CD质量)", 44100);
recordSampleRateComboBox->addItem("22050 Hz", 22050);
recordSampleRateComboBox->addItem("16000 Hz", 16000);
recordSampleRateComboBox->setCurrentIndex(0); // 默认自动检测
recordSampleRateComboBox->setToolTip("选择录音时使用的采样率,自动检测会选择设备支持的最佳格式");
// 录音声道设置
recordChannelComboBox = new QComboBox(this);
recordChannelComboBox->addItem("自动检测最佳", -1);
recordChannelComboBox->addItem("立体声 (Stereo)", 2);
recordChannelComboBox->addItem("单声道 (Mono)", 1);
recordChannelComboBox->setCurrentIndex(0); // 默认自动检测
recordChannelComboBox->setToolTip("选择录音时使用的声道数,自动检测会选择设备支持的最佳格式");
recordSettingsLayout->addWidget(new QLabel("录音采样率:", this));
recordSettingsLayout->addWidget(recordSampleRateComboBox);
recordSettingsLayout->addWidget(new QLabel("录音声道:", this));
recordSettingsLayout->addWidget(recordChannelComboBox);
recordSettingsLayout->addStretch();
recordLayout->addWidget(recordSettingsGroup);
// 输出设置区域(保存格式)
auto* outputSettingsGroup = new QGroupBox("输出设置(保存格式)", this);
auto* outputSettingsLayout = new QHBoxLayout(outputSettingsGroup);
// 输出采样率设置
outputSampleRateComboBox = new QComboBox(this);
outputSampleRateComboBox->addItem("8000 Hz", 8000);
outputSampleRateComboBox->addItem("16000 Hz (语音识别)", 16000);
outputSampleRateComboBox->addItem("22050 Hz", 22050);
outputSampleRateComboBox->addItem("44100 Hz (CD质量)", 44100);
outputSampleRateComboBox->addItem("48000 Hz (专业)", 48000);
outputSampleRateComboBox->setCurrentIndex(1); // 默认选择16000 Hz
outputSampleRateComboBox->setToolTip("选择最终保存文件的采样率");
// 输出声道设置
outputChannelComboBox = new QComboBox(this);
outputChannelComboBox->addItem("单声道 (Mono)", 1);
outputChannelComboBox->addItem("立体声 (Stereo)", 2);
outputChannelComboBox->setCurrentIndex(0); // 默认选择单声道
outputChannelComboBox->setToolTip("选择最终保存文件的声道数");
// 添加预设配置按钮
auto* presetBtn = new QPushButton("预设", this);
presetBtn->setToolTip("选择常用输出预设配置");
presetBtn->setMaximumWidth(60);
// 连接预设按钮信号
connect(presetBtn, &QPushButton::clicked, this, [this, presetBtn]() {
QMenu* presetMenu = new QMenu(this);
QAction* voiceAction = presetMenu->addAction("🎤 语音识别 (16kHz 单声道)");
connect(voiceAction, &QAction::triggered, this, [this]() {
outputSampleRateComboBox->setCurrentIndex(1); // 16000 Hz
outputChannelComboBox->setCurrentIndex(0); // 单声道
});
QAction* musicAction = presetMenu->addAction("🎵 音乐保存 (44.1kHz 立体声)");
connect(musicAction, &QAction::triggered, this, [this]() {
outputSampleRateComboBox->setCurrentIndex(3); // 44100 Hz
outputChannelComboBox->setCurrentIndex(1); // 立体声
});
QAction* professionalAction = presetMenu->addAction("🎙️ 专业保存 (48kHz 立体声)");
connect(professionalAction, &QAction::triggered, this, [this]() {
outputSampleRateComboBox->setCurrentIndex(4); // 48000 Hz
outputChannelComboBox->setCurrentIndex(1); // 立体声
});
QAction* compactAction = presetMenu->addAction("📱 紧凑保存 (22kHz 单声道)");
connect(compactAction, &QAction::triggered, this, [this]() {
outputSampleRateComboBox->setCurrentIndex(2); // 22050 Hz
outputChannelComboBox->setCurrentIndex(0); // 单声道
});
presetMenu->exec(presetBtn->mapToGlobal(QPoint(0, presetBtn->height())));
presetMenu->deleteLater();
});
outputSettingsLayout->addWidget(new QLabel("输出采样率:", this));
outputSettingsLayout->addWidget(outputSampleRateComboBox);
outputSettingsLayout->addWidget(new QLabel("输出声道:", this));
outputSettingsLayout->addWidget(outputChannelComboBox);
outputSettingsLayout->addWidget(presetBtn);
// 添加文件大小预估标签
auto* fileSizeLabel = new QLabel(this);
fileSizeLabel->setStyleSheet("QLabel { color: #888; font-size: 10px; }");
fileSizeLabel->setObjectName("fileSizeLabel");
// 连接设置变化信号来更新文件大小预估
auto updateFileSizeEstimate = [this, fileSizeLabel]() {
int sampleRate = outputSampleRateComboBox->currentData().toInt();
int channels = outputChannelComboBox->currentData().toInt();
// 计算每秒的字节数 (采样率 × 声道数 × 2字节/样本)
int bytesPerSecond = sampleRate * channels * 2;
double mbPerMinute = (bytesPerSecond * 60.0) / (1024.0 * 1024.0);
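// e.g. 16000 Hz * 1 channel * 2 bytes = 32000 B/s, i.e. roughly 1.8 MB per minute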
QString sizeText = QString("预估输出文件大小: ~%1 MB/分钟").arg(mbPerMinute, 0, 'f', 1);
fileSizeLabel->setText(sizeText);
};
connect(outputSampleRateComboBox, QOverload<int>::of(&QComboBox::currentIndexChanged), updateFileSizeEstimate);
connect(outputChannelComboBox, QOverload<int>::of(&QComboBox::currentIndexChanged), updateFileSizeEstimate);
// 初始计算
updateFileSizeEstimate();
outputSettingsLayout->addWidget(fileSizeLabel);
outputSettingsLayout->addStretch();
recordLayout->addWidget(outputSettingsGroup);
// 录音控制按钮
auto* recordControlLayout = new QHBoxLayout();
recordBtn = new QPushButton("开始录音", this);
recordBtn->setStyleSheet("QPushButton { background-color: #E91E63; color: white; font-weight: bold; }");
recordStopBtn = new QPushButton("停止录音", this);
recordStopBtn->setStyleSheet("QPushButton { background-color: #9E9E9E; color: white; font-weight: bold; }");
recordStopBtn->setEnabled(false);
recordControlLayout->addWidget(new QLabel("WAV录音:", this));
recordControlLayout->addStretch();
recordControlLayout->addWidget(recordBtn);
recordControlLayout->addWidget(recordStopBtn);
recordLayout->addLayout(recordControlLayout);
// 录音结果显示
recordResultEdit = new QTextEdit(this);
recordResultEdit->setPlaceholderText("录音文件信息将显示在这里...");
recordResultEdit->setMinimumHeight(80);
recordResultEdit->setMaximumHeight(120);
recordLayout->addWidget(new QLabel("录音结果:", this));
recordLayout->addWidget(recordResultEdit);
// 将录音组件添加到网格布局的第二行第一列
gridLayout->addWidget(recordGroup, 1, 0);
// 语音唤醒功能部分
auto* kwsGroup = new QGroupBox("语音唤醒 (KWS)", this);
auto* kwsLayout = new QVBoxLayout(kwsGroup);
// 语音唤醒控制按钮
auto* kwsControlLayout = new QHBoxLayout();
kwsStartBtn = new QPushButton("开始语音唤醒", this);
kwsStartBtn->setStyleSheet("QPushButton { background-color: #9C27B0; color: white; font-weight: bold; }");
kwsStopBtn = new QPushButton("停止唤醒", this);
kwsStopBtn->setStyleSheet("QPushButton { background-color: #9E9E9E; color: white; font-weight: bold; }");
kwsStopBtn->setEnabled(false);
kwsControlLayout->addWidget(new QLabel("关键词检测:", this));
kwsControlLayout->addStretch();
kwsControlLayout->addWidget(kwsStartBtn);
kwsControlLayout->addWidget(kwsStopBtn);
kwsLayout->addLayout(kwsControlLayout);
// 语音唤醒结果显示
kwsResultEdit = new QTextEdit(this);
kwsResultEdit->setPlaceholderText("语音唤醒检测结果将显示在这里...");
kwsResultEdit->setMinimumHeight(80);
kwsResultEdit->setMaximumHeight(120);
kwsLayout->addWidget(new QLabel("唤醒结果:", this));
kwsLayout->addWidget(kwsResultEdit);
// 将语音唤醒组件添加到网格布局的第二行第二列
gridLayout->addWidget(kwsGroup, 1, 1);
// 设置一些示例文本(中英文混合)
ttsTextEdit->setPlainText("你好,这是语音合成测试。Hello, this is a speech synthesis test. 今天天气很好,适合出门散步。The weather is nice today.");
}
void SpeechTestMainWindow::setupMenuBar() {
// 创建菜单栏
QMenuBar* menuBar = this->menuBar();
// 文件菜单
QMenu* fileMenu = menuBar->addMenu("文件(&F)");
QAction* exitAction = new QAction("退出(&X)", this);
exitAction->setShortcut(QKeySequence::Quit);
connect(exitAction, &QAction::triggered, this, &QWidget::close);
fileMenu->addAction(exitAction);
// 设置菜单
QMenu* settingsMenu = menuBar->addMenu("设置(&S)");
QAction* modelSettingsAction = new QAction("模型设置(&M)...", this);
modelSettingsAction->setShortcut(QKeySequence("Ctrl+M"));
modelSettingsAction->setToolTip("配置ASR和TTS模型");
connect(modelSettingsAction, &QAction::triggered, this, &SpeechTestMainWindow::openModelSettings);
settingsMenu->addAction(modelSettingsAction);
// 帮助菜单
QMenu* helpMenu = menuBar->addMenu("帮助(&H)");
QAction* aboutAction = new QAction("关于(&A)...", this);
connect(aboutAction, &QAction::triggered, [this]() {
QMessageBox::about(this, "关于",
"QSmartAssistant 语音测试工具 v1.0\n\n"
"基于sherpa-onnx的语音识别和合成工具\n"
"支持中英文混合语音合成");
});
helpMenu->addAction(aboutAction);
}
void SpeechTestMainWindow::createOutputDirectories() {
// 创建TTS输出目录
QString ttsOutputDir = QDir::currentPath() + "/tts_output";
if (!QDir().exists(ttsOutputDir)) {
QDir().mkpath(ttsOutputDir);
qDebug() << "创建TTS输出目录:" << ttsOutputDir;
}
// 创建录音输出目录
QString recordOutputDir = QDir::currentPath() + "/recordings";
if (!QDir().exists(recordOutputDir)) {
QDir().mkpath(recordOutputDir);
qDebug() << "创建录音输出目录:" << recordOutputDir;
}
}
void SpeechTestMainWindow::connectSignals() {
// 通过对象名称查找按钮并连接信号
QPushButton* browseBtn = findChild<QPushButton*>("browseBtn");
QPushButton* recognizeBtn = findChild<QPushButton*>("recognizeBtn");
QPushButton* synthesizeBtn = findChild<QPushButton*>("synthesizeBtn");
if (browseBtn) {
connect(browseBtn, &QPushButton::clicked, this, &SpeechTestMainWindow::browseFile);
}
if (recognizeBtn) {
connect(recognizeBtn, &QPushButton::clicked, this, &SpeechTestMainWindow::startRecognition);
}
if (synthesizeBtn) {
connect(synthesizeBtn, &QPushButton::clicked, this, &SpeechTestMainWindow::startSynthesis);
}
// 连接麦克风按钮信号
connect(micRecordBtn, &QPushButton::clicked, this, &SpeechTestMainWindow::startMicRecognition);
connect(micStopBtn, &QPushButton::clicked, this, &SpeechTestMainWindow::stopMicRecognition);
// 连接录音按钮信号
connect(recordBtn, &QPushButton::clicked, this, &SpeechTestMainWindow::startRecording);
connect(recordStopBtn, &QPushButton::clicked, this, &SpeechTestMainWindow::stopRecording);
// 连接语音唤醒按钮信号
connect(kwsStartBtn, &QPushButton::clicked, this, &SpeechTestMainWindow::startKWS);
connect(kwsStopBtn, &QPushButton::clicked, this, &SpeechTestMainWindow::stopKWS);
}
void SpeechTestMainWindow::browseFile() {
QString fileName = QFileDialog::getOpenFileName(
this, "选择WAV音频文件", "", "WAV Files (*.wav)");
if (!fileName.isEmpty()) {
filePathEdit->setText(fileName);
}
}
void SpeechTestMainWindow::startRecognition() {
QString filePath = filePathEdit->text().trimmed();
if (filePath.isEmpty()) {
QMessageBox::warning(this, "警告", "请先选择音频文件");
return;
}
if (!QFile::exists(filePath)) {
QMessageBox::warning(this, "警告", "文件不存在: " + filePath);
return;
}
if (!asrManager->isInitialized()) {
QMessageBox::critical(this, "错误", "ASR模型未初始化");
return;
}
asrResultEdit->clear();
asrResultEdit->append("正在识别,请稍候...");
statusBar()->showMessage("正在进行语音识别...");
// 使用QTimer延迟执行避免界面卡顿
QTimer::singleShot(100, this, [this, filePath]() {
QString result = asrManager->recognizeWavFile(filePath);
asrResultEdit->clear();
asrResultEdit->append("识别结果: " + result);
statusBar()->showMessage("语音识别完成");
});
}
void SpeechTestMainWindow::startSynthesis() {
QString text = ttsTextEdit->toPlainText().trimmed();
if (text.isEmpty()) {
QMessageBox::warning(this, "警告", "请输入要合成的文本");
return;
}
if (!ttsManager->isInitialized()) {
QMessageBox::critical(this, "错误", "TTS模型未初始化");
return;
}
int speakerId = speakerIdSpinBox->value();
// 创建项目目录下的输出文件夹
QString outputDir = QDir::currentPath() + "/tts_output";
QDir().mkpath(outputDir);
QString outputPath = outputDir + "/tts_" +
QDateTime::currentDateTime().toString("yyyyMMdd_hhmmss") +
"_speaker" + QString::number(speakerId) + ".wav";
ttsResultEdit->clear();
ttsResultEdit->append("正在合成,请稍候...");
statusBar()->showMessage("正在进行语音合成...");
// 使用QTimer延迟执行避免界面卡顿
QTimer::singleShot(100, this, [this, text, speakerId, outputPath]() {
bool success = ttsManager->synthesizeText(text, speakerId, outputPath);
ttsResultEdit->clear();
if (success) {
ttsResultEdit->append("语音合成成功");
// 显示相对路径,更简洁
QString relativePath = QDir::current().relativeFilePath(outputPath);
ttsResultEdit->append("输出文件: " + relativePath);
ttsResultEdit->append("完整路径: " + outputPath);
statusBar()->showMessage("语音合成完成,保存至: " + relativePath);
// 询问是否播放
int ret = QMessageBox::question(this, "合成完成",
"语音合成完成!是否要播放生成的音频?\n\n文件位置: " + outputPath,
QMessageBox::Yes | QMessageBox::No);
if (ret == QMessageBox::Yes) {
// 在macOS上使用afplay播放音频
QProcess::startDetached("afplay", QStringList() << outputPath);
}
} else {
ttsResultEdit->append("语音合成失败");
statusBar()->showMessage("语音合成失败");
}
});
}
void SpeechTestMainWindow::startMicRecognition() {
if (!asrManager->isOnlineInitialized()) {
QMessageBox::information(this, "功能不可用",
"在线识别模型未初始化。\n"
"请确保sherpa-onnx-streaming-paraformer-bilingual-zh-en模型已正确安装。");
return;
}
if (isRecording) {
return;
}
// 提示用户检查麦克风权限
qDebug() << "开始麦克风识别,请确保已授予麦克风权限";
// 获取默认音频设备
QAudioDevice defaultDevice = QMediaDevices::defaultAudioInput();
qDebug() << "默认音频设备:" << defaultDevice.description();
qDebug() << "设备ID:" << defaultDevice.id();
// 首先尝试使用设备的首选格式
QAudioFormat preferredFormat = defaultDevice.preferredFormat();
qDebug() << "设备首选格式 - 采样率:" << preferredFormat.sampleRate()
<< "声道:" << preferredFormat.channelCount()
<< "格式:" << static_cast<int>(preferredFormat.sampleFormat());
// 使用设备支持的最佳格式进行录制然后转换为16kHz单声道
QAudioFormat format;
// 优先尝试高质量格式
QList<int> preferredSampleRates = {48000, 44100, 22050, 16000};
QList<int> preferredChannels = {2, 1}; // 优先立体声
QList<QAudioFormat::SampleFormat> preferredFormats = {QAudioFormat::Int16, QAudioFormat::Float};
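// The search below prefers higher sample rates and stereo; whatever format the device accepts is converted to 16 kHz mono later in processAudioData().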
bool formatFound = false;
// 寻找设备支持的最佳格式
for (int sampleRate : preferredSampleRates) {
for (int channels : preferredChannels) {
for (QAudioFormat::SampleFormat sampleFormat : preferredFormats) {
format.setSampleRate(sampleRate);
format.setChannelCount(channels);
format.setSampleFormat(sampleFormat);
if (defaultDevice.isFormatSupported(format)) {
qDebug() << "找到最佳支持格式 - 采样率:" << sampleRate
<< "声道:" << channels
<< "格式:" << static_cast<int>(sampleFormat);
formatFound = true;
break;
}
}
if (formatFound) break;
}
if (formatFound) break;
}
if (!formatFound) {
// 如果都不支持,使用设备首选格式
format = preferredFormat;
qDebug() << "使用设备首选格式";
}
qDebug() << "最终使用的音频格式 - 采样率:" << format.sampleRate()
<< "声道:" << format.channelCount()
<< "格式:" << static_cast<int>(format.sampleFormat());
// 创建在线流
onlineStream = asrManager->createOnlineStream();
if (!onlineStream) {
QMessageBox::critical(this, "错误", "无法创建在线识别流");
return;
}
qDebug() << "在线识别流创建成功";
// 保存音频格式信息用于后续处理
currentAudioFormat = format;
originalSampleRate = format.sampleRate();
originalChannelCount = format.channelCount();
// 创建音频源 - 使用更保守的设置
audioSource = new QAudioSource(defaultDevice, format, this);
// 使用较小的缓冲区,有时大缓冲区会导致问题
audioSource->setBufferSize(4096);
// 设置音量
audioSource->setVolume(1.0);
// 连接状态变化信号
connect(audioSource, &QAudioSource::stateChanged, this, [this](QAudio::State state) {
qDebug() << "音频源状态变化:" << state;
if (state == QAudio::StoppedState) {
qDebug() << "音频源错误:" << audioSource->error();
} else if (state == QAudio::ActiveState) {
qDebug() << "音频源已激活!";
}
});
qDebug() << "尝试启动音频输入...";
// 启动音频输入
audioDevice = audioSource->start();
if (!audioDevice) {
qDebug() << "第一次启动失败,尝试其他方法...";
// 尝试使用pull模式
QByteArray buffer;
buffer.resize(4096);
audioDevice = audioSource->start();
if (!audioDevice) {
QMessageBox::critical(this, "错误", "无法启动音频输入,请检查麦克风权限");
asrManager->destroyOnlineStream(onlineStream);
onlineStream = nullptr;
delete audioSource;
audioSource = nullptr;
return;
}
}
qDebug() << "音频输入启动成功";
qDebug() << "初始音频源状态:" << audioSource->state();
qDebug() << "音频源错误:" << audioSource->error();
qDebug() << "缓冲区大小:" << audioSource->bufferSize();
// 等待音频源状态稳定并进行测试
QTimer::singleShot(200, this, [this]() {
if (audioSource) {
qDebug() << "音频源最终状态:" << audioSource->state();
qDebug() << "音频源错误状态:" << audioSource->error();
// 尝试强制激活音频源
if (audioSource->state() == QAudio::IdleState) {
qDebug() << "音频源处于空闲状态,尝试多种激活方法...";
// 方法1:暂停和恢复
audioSource->suspend();
QTimer::singleShot(50, this, [this]() {
if (audioSource) {
audioSource->resume();
qDebug() << "方法1恢复后状态:" << audioSource->state();
// 方法2:如果仍然是IdleState,尝试重新创建
if (audioSource->state() == QAudio::IdleState) {
QTimer::singleShot(100, this, [this]() {
if (audioSource) {
qDebug() << "尝试重新创建音频源...";
audioSource->stop();
delete audioSource;
// 重新创建音频源
QAudioDevice device = QMediaDevices::defaultAudioInput();
audioSource = new QAudioSource(device, currentAudioFormat, this);
audioSource->setBufferSize(16384);
// 重新连接信号
connect(audioSource, &QAudioSource::stateChanged, this, [this](QAudio::State state) {
qDebug() << "重新创建后音频源状态变化:" << state;
});
audioDevice = audioSource->start();
qDebug() << "重新创建后音频源状态:" << audioSource->state();
}
});
}
}
});
}
// 显示麦克风权限提示
if (audioSource->state() != QAudio::ActiveState) {
statusBar()->showMessage("提示:如果没有声音输入,请检查系统设置中的麦克风权限");
asrResultEdit->append("提示:请确保已在系统设置 → 安全性与隐私 → 麦克风中授予权限");
}
}
});
// 创建定时器读取音频数据
audioTimer = new QTimer(this);
connect(audioTimer, &QTimer::timeout, this, &SpeechTestMainWindow::processAudioData);
audioTimer->start(100); // 每100ms处理一次音频数据
// 添加一个备用定时器,用于强制检查音频状态
QTimer* statusTimer = new QTimer(this);
connect(statusTimer, &QTimer::timeout, this, [this]() {
if (audioSource && isRecording) {
static int checkCount = 0;
checkCount++;
if (checkCount % 10 == 0) { // 每秒检查一次
qDebug() << "状态检查 - 音频源状态:" << audioSource->state()
<< "错误:" << audioSource->error()
<< "可用字节:" << (audioDevice ? audioDevice->bytesAvailable() : 0);
// 如果长时间处于IdleState,尝试重新启动
if (audioSource->state() == QAudio::IdleState && checkCount > 50) {
qDebug() << "长时间空闲,尝试重新启动音频源...";
audioSource->stop();
QTimer::singleShot(100, this, [this]() {
if (audioSource && isRecording) {
audioDevice = audioSource->start();
}
});
checkCount = 0;
}
}
}
});
statusTimer->start(100);
isRecording = true;
micRecordBtn->setEnabled(false);
micStopBtn->setEnabled(true);
micRecordBtn->setText("识别中...");
asrResultEdit->clear();
asrResultEdit->append("开始麦克风识别,请说话...");
statusBar()->showMessage("正在进行麦克风识别...");
qDebug() << "麦克风识别已启动";
}
void SpeechTestMainWindow::stopMicRecognition() {
if (!isRecording) {
return;
}
isRecording = false;
// 停止音频输入
if (audioSource) {
audioSource->stop();
delete audioSource;
audioSource = nullptr;
}
// 停止定时器
if (audioTimer) {
audioTimer->stop();
delete audioTimer;
audioTimer = nullptr;
}
// 获取最终识别结果
if (onlineStream) {
asrManager->inputFinished(onlineStream);
// 等待最后的识别结果
QTimer::singleShot(500, this, [this]() {
if (onlineStream) {
QString finalText = asrManager->getStreamResult(onlineStream);
if (!finalText.isEmpty()) {
asrResultEdit->append("最终识别结果: " + finalText);
}
asrManager->destroyOnlineStream(onlineStream);
onlineStream = nullptr;
}
});
}
micRecordBtn->setEnabled(true);
micStopBtn->setEnabled(false);
micRecordBtn->setText("开始麦克风识别");
statusBar()->showMessage("麦克风识别已停止");
qDebug() << "麦克风识别已停止";
}
void SpeechTestMainWindow::processAudioData() {
if (!audioDevice || !onlineStream || !isRecording) {
return;
}
// 检查音频源状态,但不立即返回
if (audioSource->state() != QAudio::ActiveState) {
static int idleCount = 0;
idleCount++;
if (idleCount % 50 == 0) { // 每50次输出一次警告
qDebug() << "音频源状态异常:" << audioSource->state() << "错误:" << audioSource->error();
}
// 尝试重新启动音频源
if (idleCount > 100 && audioSource->state() == QAudio::IdleState) {
qDebug() << "尝试重新启动音频源...";
audioSource->stop();
audioDevice = audioSource->start();
idleCount = 0;
}
// 即使状态异常,也尝试读取数据
}
// 强制读取音频数据,即使状态不是Active
QByteArray audioData;
if (audioDevice) {
audioData = audioDevice->readAll();
// 如果没有数据,尝试直接从音频源读取
if (audioData.isEmpty() && audioSource) {
qint64 bytesAvailable = audioDevice->bytesAvailable();
if (bytesAvailable > 0) {
audioData = audioDevice->read(std::min(bytesAvailable, qint64(4096)));
}
}
}
if (audioData.isEmpty()) {
return;
}
static int totalSamples = 0;
static int callCount = 0;
callCount++;
// 每100次调用输出一次调试信息
if (callCount % 100 == 0) {
qDebug() << "原始音频数据 - 调用次数:" << callCount
<< "数据大小:" << audioData.size() << "字节"
<< "格式:" << currentAudioFormat.sampleRate() << "Hz"
<< currentAudioFormat.channelCount() << "声道";
}
// 定义目标格式(语音识别需要的格式)
QAudioFormat targetFormat;
targetFormat.setSampleRate(16000);
targetFormat.setChannelCount(1);
targetFormat.setSampleFormat(QAudioFormat::Float);
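// Float samples produced by convertAudioFormat() below are normalized to [-1, 1], which is what acceptWaveform() is fed.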
// 使用音频格式转换方法
QByteArray convertedData = convertAudioFormat(audioData, currentAudioFormat, targetFormat);
if (convertedData.isEmpty()) {
return;
}
// 转换后的数据已经是16kHz单声道浮点格式
const float* samples = reinterpret_cast<const float*>(convertedData.data());
int sampleCount = convertedData.size() / sizeof(float);
totalSamples += sampleCount;
if (callCount % 100 == 0) {
qDebug() << "转换后音频数据 - 样本数:" << sampleCount
<< "总样本数:" << totalSamples;
}
// 发送音频数据到识别器
if (sampleCount > 0) {
asrManager->acceptWaveform(onlineStream, samples, sampleCount);
}
// 检查是否有识别结果
int decodeCount = 0;
while (asrManager->isStreamReady(onlineStream)) {
asrManager->decodeStream(onlineStream);
decodeCount++;
if (decodeCount > 10) break; // 防止无限循环
}
// 获取部分识别结果
QString partialText = asrManager->getStreamResult(onlineStream);
if (!partialText.isEmpty()) {
qDebug() << "识别到文本:" << partialText;
// 更新显示(这里显示实时识别结果)
statusBar()->showMessage("识别中: " + partialText);
// 检查是否检测到端点
if (asrManager->isEndpoint(onlineStream)) {
asrResultEdit->append("识别片段: " + partialText);
qDebug() << "检测到端点,重置流";
// 重置流以继续识别
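// Note: the stream is destroyed and recreated (rather than reset in place), so the next utterance starts from a clean decoder state.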
asrManager->destroyOnlineStream(onlineStream);
onlineStream = asrManager->createOnlineStream();
}
} else {
// 即使没有文本,也显示正在处理的状态
if (callCount % 20 == 0) { // 每20次调用更新一次状态
// 计算音频电平
float maxLevel = 0.0f;
for (int i = 0; i < sampleCount; i++) {
maxLevel = std::max(maxLevel, std::abs(samples[i]));
}
QString statusMsg = QString("正在监听... (样本: %1, 电平: %2)")
.arg(totalSamples)
.arg(maxLevel, 0, 'f', 3);
statusBar()->showMessage(statusMsg);
// 如果检测到音频信号
if (maxLevel > 0.01f) {
qDebug() << "检测到音频信号,电平:" << maxLevel;
}
}
}
}
void SpeechTestMainWindow::openModelSettings() {
ModelSettingsDialog dialog(this);
// 设置当前配置
ModelConfig offlineAsrConfig;
offlineAsrConfig.modelPath = ""; // 从ASRManager获取当前配置
dialog.setCurrentOfflineASRConfig(offlineAsrConfig);
ModelConfig onlineAsrConfig;
onlineAsrConfig.modelPath = ""; // 从ASRManager获取当前配置
dialog.setCurrentOnlineASRConfig(onlineAsrConfig);
ModelConfig kwsConfig;
kwsConfig.modelPath = ""; // 从KWS管理器获取当前配置
dialog.setCurrentKWSConfig(kwsConfig);
ModelConfig ttsConfig;
ttsConfig.modelPath = ""; // 从TTSManager获取当前配置
dialog.setCurrentTTSConfig(ttsConfig);
// 连接信号
connect(&dialog, &ModelSettingsDialog::modelsChanged,
this, &SpeechTestMainWindow::onModelsChanged);
dialog.exec();
}
void SpeechTestMainWindow::onModelsChanged() {
// 重新初始化模型
reinitializeModels();
// 更新状态栏
bool asrOk = asrManager->isInitialized();
bool ttsOk = ttsManager->isInitialized();
bool kwsOk = kwsManager->isInitialized();
if (asrOk && ttsOk && kwsOk) {
QString modelInfo = ttsManager->isMultilingualModel() ? "(支持中英文混合)" : "(仅支持中文)";
QString micInfo = asrManager->isOnlineInitialized() ? "" : ",麦克风识别暂不可用";
QString kwsInfo = kwsOk ? ",语音唤醒可用" : ",语音唤醒不可用";
statusBar()->showMessage("模型重新加载成功 " + modelInfo + micInfo + kwsInfo);
} else {
statusBar()->showMessage("模型重新加载失败");
}
}
void SpeechTestMainWindow::reinitializeModels() {
// 如果KWS正在运行,先停止它
bool wasKWSActive = isKWSActive;
if (isKWSActive) {
stopKWS();
}
// 重新初始化ASR管理器
bool asrOk = asrManager->initialize();
// 重新初始化TTS管理器
bool ttsOk = ttsManager->initialize();
// 重新初始化KWS管理器
bool kwsOk = kwsManager->initialize();
// 尝试初始化在线识别器
asrManager->initializeOnlineRecognizer();
qDebug() << "模型重新初始化 - ASR:" << (asrOk ? "成功" : "失败")
<< "TTS:" << (ttsOk ? "成功" : "失败")
<< "KWS:" << (kwsOk ? "成功" : "失败");
// 如果之前KWS是激活的,重新启动它
if (wasKWSActive && kwsOk) {
QTimer::singleShot(1000, this, &SpeechTestMainWindow::startKWS);
qDebug() << "将在1秒后重新启动KWS";
}
}
void SpeechTestMainWindow::startRecording() {
if (isRecordingWav) {
return;
}
// 检查是否正在进行语音识别
if (isRecording) {
QMessageBox::information(this, "提示", "请先停止语音识别再开始录音");
return;
}
qDebug() << "开始WAV录音";
// 获取默认音频设备
QAudioDevice defaultDevice = QMediaDevices::defaultAudioInput();
qDebug() << "录音设备:" << defaultDevice.description();
// 获取录音设置(设备参数)
int recordSampleRate = recordSampleRateComboBox->currentData().toInt();
int recordChannels = recordChannelComboBox->currentData().toInt();
// 获取输出设置(保存格式)
int outputSampleRate = outputSampleRateComboBox->currentData().toInt();
int outputChannels = outputChannelComboBox->currentData().toInt();
qDebug() << "录音设置 - 采样率:" << recordSampleRate << "Hz, 声道:" << recordChannels;
qDebug() << "输出设置 - 采样率:" << outputSampleRate << "Hz, 声道:" << outputChannels;
// 确定实际录音格式
QAudioFormat deviceOptimalFormat;
if (recordSampleRate == -1 || recordChannels == -1) {
// 自动检测设备最佳格式
qDebug() << "自动检测设备最佳录音格式...";
QList<int> deviceSampleRates = {48000, 44100, 22050, 16000};
QList<int> deviceChannels = {2, 1};
QList<QAudioFormat::SampleFormat> deviceFormats = {QAudioFormat::Int16, QAudioFormat::Float};
bool foundDeviceFormat = false;
for (int sampleRate : deviceSampleRates) {
for (int channels : deviceChannels) {
for (QAudioFormat::SampleFormat format : deviceFormats) {
deviceOptimalFormat.setSampleRate(sampleRate);
deviceOptimalFormat.setChannelCount(channels);
deviceOptimalFormat.setSampleFormat(format);
if (defaultDevice.isFormatSupported(deviceOptimalFormat)) {
qDebug() << "找到设备最佳格式:" << sampleRate << "Hz,"
<< channels << "声道," << static_cast<int>(format);
foundDeviceFormat = true;
break;
}
}
if (foundDeviceFormat) break;
}
if (foundDeviceFormat) break;
}
if (!foundDeviceFormat) {
deviceOptimalFormat = defaultDevice.preferredFormat();
qDebug() << "使用设备首选格式";
}
} else {
// 使用用户指定的录音格式
deviceOptimalFormat.setSampleRate(recordSampleRate);
deviceOptimalFormat.setChannelCount(recordChannels);
deviceOptimalFormat.setSampleFormat(QAudioFormat::Int16);
// 检查用户指定格式是否被支持
if (!defaultDevice.isFormatSupported(deviceOptimalFormat)) {
qDebug() << "用户指定的录音格式不被支持,自动寻找最佳格式...";
// 回退到自动检测
QList<int> deviceSampleRates = {recordSampleRate, 48000, 44100, 22050, 16000};
QList<int> deviceChannels = {recordChannels, 2, 1};
QList<QAudioFormat::SampleFormat> deviceFormats = {QAudioFormat::Int16, QAudioFormat::Float};
bool foundDeviceFormat = false;
for (int sampleRate : deviceSampleRates) {
for (int channels : deviceChannels) {
for (QAudioFormat::SampleFormat format : deviceFormats) {
deviceOptimalFormat.setSampleRate(sampleRate);
deviceOptimalFormat.setChannelCount(channels);
deviceOptimalFormat.setSampleFormat(format);
if (defaultDevice.isFormatSupported(deviceOptimalFormat)) {
qDebug() << "找到兼容格式:" << sampleRate << "Hz,"
<< channels << "声道," << static_cast<int>(format);
foundDeviceFormat = true;
break;
}
}
if (foundDeviceFormat) break;
}
if (foundDeviceFormat) break;
}
if (!foundDeviceFormat) {
deviceOptimalFormat = defaultDevice.preferredFormat();
qDebug() << "使用设备首选格式";
}
}
}
// 使用确定的设备格式进行录制
recordAudioFormat = deviceOptimalFormat;
// 检查格式支持并智能降级
QString formatInfo = QString("尝试格式: %1 Hz, %2声道")
.arg(recordAudioFormat.sampleRate())
.arg(recordAudioFormat.channelCount() == 1 ? "单" : "立体");
qDebug() << formatInfo;
if (!defaultDevice.isFormatSupported(recordAudioFormat)) {
qDebug() << "设备不支持选择的格式,尝试降级...";
// 如果是立体声,尝试单声道
if (recordAudioFormat.channelCount() == 2) {
recordAudioFormat.setChannelCount(1);
qDebug() << "尝试单声道格式";
if (!defaultDevice.isFormatSupported(recordAudioFormat)) {
// 尝试降低采样率
QList<int> fallbackRates = {44100, 22050, 16000, 8000};
bool foundSupported = false;
for (int rate : fallbackRates) {
if (rate < recordSampleRate) {
recordAudioFormat.setSampleRate(rate);
if (defaultDevice.isFormatSupported(recordAudioFormat)) {
qDebug() << "降级到采样率:" << rate << "Hz";
foundSupported = true;
break;
}
}
}
if (!foundSupported) {
// 最后使用设备首选格式
recordAudioFormat = defaultDevice.preferredFormat();
qDebug() << "使用设备首选录音格式";
}
}
} else {
// 单声道情况下,尝试降低采样率
QList<int> fallbackRates = {44100, 22050, 16000, 8000};
bool foundSupported = false;
for (int rate : fallbackRates) {
if (rate < recordSampleRate) {
recordAudioFormat.setSampleRate(rate);
if (defaultDevice.isFormatSupported(recordAudioFormat)) {
qDebug() << "降级到采样率:" << rate << "Hz";
foundSupported = true;
break;
}
}
}
if (!foundSupported) {
recordAudioFormat = defaultDevice.preferredFormat();
qDebug() << "使用设备首选录音格式";
}
}
// 显示实际使用的格式
QString actualFormat = QString("实际使用格式: %1 Hz, %2声道")
.arg(recordAudioFormat.sampleRate())
.arg(recordAudioFormat.channelCount() == 1 ? "单" : "立体");
qDebug() << actualFormat;
// 如果格式发生了变化,通知用户
if (recordAudioFormat.sampleRate() != recordSampleRate ||
recordAudioFormat.channelCount() != recordChannels) {
recordResultEdit->append("注意:设备不支持选择的格式,已自动调整");
}
}
qDebug() << "录音格式 - 采样率:" << recordAudioFormat.sampleRate()
<< "声道:" << recordAudioFormat.channelCount()
<< "格式:" << static_cast<int>(recordAudioFormat.sampleFormat());
// 创建输出文件路径
QString outputDir = QDir::currentPath() + "/recordings";
QDir().mkpath(outputDir);
currentRecordingPath = outputDir + "/recording_" +
QDateTime::currentDateTime().toString("yyyyMMdd_hhmmss") +
".wav";
// 清空录音数据缓冲区
recordedData.clear();
// 创建音频源
recordAudioSource = new QAudioSource(defaultDevice, recordAudioFormat, this);
recordAudioSource->setBufferSize(8192);
recordAudioSource->setVolume(1.0);
// 连接状态变化信号
connect(recordAudioSource, &QAudioSource::stateChanged, this, [this](QAudio::State state) {
qDebug() << "录音音频源状态变化:" << state;
if (state == QAudio::StoppedState) {
qDebug() << "录音音频源错误:" << recordAudioSource->error();
} else if (state == QAudio::ActiveState) {
qDebug() << "录音音频源已激活!";
}
});
// 启动音频输入
recordAudioDevice = recordAudioSource->start();
if (!recordAudioDevice) {
QMessageBox::critical(this, "错误", "无法启动录音,请检查麦克风权限");
delete recordAudioSource;
recordAudioSource = nullptr;
return;
}
// 创建定时器读取音频数据
recordTimer = new QTimer(this);
connect(recordTimer, &QTimer::timeout, this, &SpeechTestMainWindow::processRecordingData);
recordTimer->start(100); // 每100ms处理一次音频数据
isRecordingWav = true;
recordBtn->setEnabled(false);
recordStopBtn->setEnabled(true);
recordBtn->setText("录音中...");
// 录音期间禁用设置选项
recordSampleRateComboBox->setEnabled(false);
recordChannelComboBox->setEnabled(false);
outputSampleRateComboBox->setEnabled(false);
outputChannelComboBox->setEnabled(false);
recordResultEdit->clear();
recordResultEdit->append("开始录音,请说话...");
recordResultEdit->append(QString("录音格式: %1 Hz, %2")
.arg(recordAudioFormat.sampleRate())
.arg(recordAudioFormat.channelCount() == 1 ? "单声道" : "立体声"));
recordResultEdit->append(QString("输出格式: %1 Hz, %2")
.arg(outputSampleRate)
.arg(outputChannels == 1 ? "单声道" : "立体声"));
recordResultEdit->append("输出文件: " + QDir::current().relativeFilePath(currentRecordingPath));
statusBar()->showMessage("正在录音...");
qDebug() << "WAV录音已启动输出文件:" << currentRecordingPath;
}
void SpeechTestMainWindow::stopRecording() {
if (!isRecordingWav) {
return;
}
isRecordingWav = false;
// 停止音频输入
if (recordAudioSource) {
recordAudioSource->stop();
delete recordAudioSource;
recordAudioSource = nullptr;
}
// 停止定时器
if (recordTimer) {
recordTimer->stop();
delete recordTimer;
recordTimer = nullptr;
}
recordBtn->setEnabled(true);
recordStopBtn->setEnabled(false);
recordBtn->setText("开始录音");
// 重新启用设置选项
recordSampleRateComboBox->setEnabled(true);
recordChannelComboBox->setEnabled(true);
outputSampleRateComboBox->setEnabled(true);
outputChannelComboBox->setEnabled(true);
// 保存WAV文件
if (!recordedData.isEmpty()) {
// 获取输出设置
int outputSampleRate = outputSampleRateComboBox->currentData().toInt();
int outputChannels = outputChannelComboBox->currentData().toInt();
QAudioFormat outputFormat;
outputFormat.setSampleRate(outputSampleRate);
outputFormat.setChannelCount(outputChannels);
outputFormat.setSampleFormat(QAudioFormat::Int16);
QByteArray finalAudioData = recordedData;
QAudioFormat finalFormat = recordAudioFormat;
// 如果录制格式与输出格式不同,进行转换
if (recordAudioFormat.sampleRate() != outputSampleRate ||
recordAudioFormat.channelCount() != outputChannels) {
qDebug() << "转换录音格式从" << recordAudioFormat.sampleRate() << "Hz"
<< recordAudioFormat.channelCount() << "声道到"
<< outputSampleRate << "Hz" << outputChannels << "声道";
finalAudioData = convertAudioFormat(recordedData, recordAudioFormat, outputFormat);
finalFormat = outputFormat;
if (finalAudioData.isEmpty()) {
recordResultEdit->append("音频格式转换失败!");
statusBar()->showMessage("录音保存失败 - 格式转换错误");
return;
}
recordResultEdit->append("✅ 音频格式转换完成");
} else {
recordResultEdit->append("✅ 录音格式与输出格式一致,无需转换");
}
// 保存输出格式的文件
bool success = saveWavFile(currentRecordingPath, finalAudioData, finalFormat);
if (success) {
QFileInfo fileInfo(currentRecordingPath);
double durationSeconds = (double)finalAudioData.size() /
(finalFormat.sampleRate() *
finalFormat.channelCount() *
(finalFormat.sampleFormat() == QAudioFormat::Int16 ? 2 : 4));
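// duration (s) = byte count / (sample rate * channels * bytes per sample)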
recordResultEdit->append(QString("🎉 录音完成!时长: %1 秒").arg(durationSeconds, 0, 'f', 1));
recordResultEdit->append(QString("📊 最终格式: %1 Hz, %2, 16位")
.arg(finalFormat.sampleRate())
.arg(finalFormat.channelCount() == 1 ? "单声道" : "立体声"));
recordResultEdit->append(QString("📁 文件大小: %1 KB").arg(fileInfo.size() / 1024.0, 0, 'f', 1));
recordResultEdit->append("📂 完整路径: " + currentRecordingPath);
statusBar()->showMessage("录音已保存: " + QDir::current().relativeFilePath(currentRecordingPath));
// 询问是否播放录音
int ret = QMessageBox::question(this, "录音完成",
QString("录音已保存!\n文件: %1\n时长: %2 秒\n\n是否要播放录音?")
.arg(QDir::current().relativeFilePath(currentRecordingPath))
.arg(durationSeconds, 0, 'f', 1),
QMessageBox::Yes | QMessageBox::No);
if (ret == QMessageBox::Yes) {
// 在macOS上使用afplay播放音频
QProcess::startDetached("afplay", QStringList() << currentRecordingPath);
}
} else {
recordResultEdit->append("录音保存失败!");
statusBar()->showMessage("录音保存失败");
}
} else {
recordResultEdit->append("没有录制到音频数据");
statusBar()->showMessage("录音失败 - 没有数据");
}
qDebug() << "WAV录音已停止";
}
void SpeechTestMainWindow::processRecordingData() {
if (!recordAudioDevice || !isRecordingWav) {
return;
}
// 读取音频数据
QByteArray audioData = recordAudioDevice->readAll();
if (!audioData.isEmpty()) {
// 将数据添加到录音缓冲区
recordedData.append(audioData);
// 更新录音状态显示
static int updateCount = 0;
updateCount++;
if (updateCount % 10 == 0) { // 每秒更新一次
double durationSeconds = (double)recordedData.size() /
(recordAudioFormat.sampleRate() *
recordAudioFormat.channelCount() *
(recordAudioFormat.sampleFormat() == QAudioFormat::Int16 ? 2 : 4));
statusBar()->showMessage(QString("录音中... %1 秒").arg(durationSeconds, 0, 'f', 1));
}
}
}
bool SpeechTestMainWindow::saveWavFile(const QString& filePath, const QByteArray& audioData, const QAudioFormat& format) {
QFile file(filePath);
if (!file.open(QIODevice::WriteOnly)) {
qDebug() << "无法创建WAV文件:" << filePath;
return false;
}
// WAV文件头
QDataStream stream(&file);
stream.setByteOrder(QDataStream::LittleEndian);
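// Canonical 44-byte WAV header layout written below:
//   "RIFF" + chunk size (file size - 8) + "WAVE"
//   "fmt " + 16 + <format tag, channels, sample rate, byte rate, block align, bits per sample>
//   "data" + data size, then the raw sample bytes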
// RIFF头
stream.writeRawData("RIFF", 4);
quint32 fileSize = 36 + audioData.size();
stream << fileSize;
stream.writeRawData("WAVE", 4);
// fmt子块
stream.writeRawData("fmt ", 4);
quint32 fmtSize = 16;
stream << fmtSize;
quint16 audioFormat = 1; // PCM
stream << audioFormat;
quint16 numChannels = format.channelCount();
stream << numChannels;
quint32 sampleRate = format.sampleRate();
stream << sampleRate;
quint16 bitsPerSample = (format.sampleFormat() == QAudioFormat::Int16) ? 16 : 32;
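// Note: the format tag written above is 1 (integer PCM); if 32-bit float data were saved here it would need tag 3 (IEEE float) to play back correctly.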
quint32 byteRate = sampleRate * numChannels * (bitsPerSample / 8);
stream << byteRate;
quint16 blockAlign = numChannels * (bitsPerSample / 8);
stream << blockAlign;
stream << bitsPerSample;
// data子块
stream.writeRawData("data", 4);
quint32 dataSize = audioData.size();
stream << dataSize;
// 写入音频数据
stream.writeRawData(audioData.constData(), audioData.size());
file.close();
qDebug() << "WAV文件保存成功:" << filePath;
qDebug() << "文件大小:" << (fileSize + 8) << "字节";
qDebug() << "音频格式:" << numChannels << "声道," << sampleRate << "Hz," << bitsPerSample << "";
return true;
}
QByteArray SpeechTestMainWindow::convertAudioFormat(const QByteArray& inputData,
const QAudioFormat& inputFormat,
const QAudioFormat& outputFormat) {
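// Conversion pipeline: decode samples to float, downmix to mono by averaging (only when the target is mono),
// resample with linear interpolation, then encode to the target sample format. Mono-to-stereo upmixing is not handled.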
if (inputData.isEmpty()) {
return QByteArray();
}
// 如果格式相同,直接返回
if (inputFormat.sampleRate() == outputFormat.sampleRate() &&
inputFormat.channelCount() == outputFormat.channelCount() &&
inputFormat.sampleFormat() == outputFormat.sampleFormat()) {
return inputData;
}
// qDebug() << "音频格式转换:"
// << inputFormat.sampleRate() << "Hz" << inputFormat.channelCount() << "声道"
// << "→"
// << outputFormat.sampleRate() << "Hz" << outputFormat.channelCount() << "声道";
// 第一步:转换为浮点格式
std::vector<float> samples;
int inputSampleCount = 0;
if (inputFormat.sampleFormat() == QAudioFormat::Int16) {
const int16_t* intData = reinterpret_cast<const int16_t*>(inputData.data());
inputSampleCount = inputData.size() / 2;
samples.resize(inputSampleCount);
for (int i = 0; i < inputSampleCount; i++) {
samples[i] = intData[i] / 32768.0f;
}
} else if (inputFormat.sampleFormat() == QAudioFormat::Float) {
const float* floatData = reinterpret_cast<const float*>(inputData.data());
inputSampleCount = inputData.size() / sizeof(float);
samples.assign(floatData, floatData + inputSampleCount);
} else {
qDebug() << "不支持的输入音频格式:" << static_cast<int>(inputFormat.sampleFormat());
return QByteArray();
}
// 第二步:处理多声道转单声道
if (inputFormat.channelCount() > outputFormat.channelCount() && outputFormat.channelCount() == 1) {
std::vector<float> monoSamples;
int frameCount = inputSampleCount / inputFormat.channelCount();
monoSamples.reserve(frameCount);
for (int frame = 0; frame < frameCount; frame++) {
float sum = 0.0f;
for (int ch = 0; ch < inputFormat.channelCount(); ch++) {
int index = frame * inputFormat.channelCount() + ch;
if (index < inputSampleCount) {
sum += samples[index];
}
}
monoSamples.push_back(sum / inputFormat.channelCount());
}
samples = std::move(monoSamples);
inputSampleCount = samples.size();
}
// 第三步:重采样
if (inputFormat.sampleRate() != outputFormat.sampleRate()) {
std::vector<float> resampledSamples;
float ratio = static_cast<float>(outputFormat.sampleRate()) / inputFormat.sampleRate();
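// ratio > 1 upsamples, ratio < 1 downsamples; output index i maps back to the fractional source position i / ratio.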
int newSampleCount = static_cast<int>(inputSampleCount * ratio);
resampledSamples.reserve(newSampleCount);
for (int i = 0; i < newSampleCount; i++) {
float srcIndex = i / ratio;
int index = static_cast<int>(srcIndex);
if (index < inputSampleCount - 1) {
// 线性插值
float frac = srcIndex - index;
float sample = samples[index] * (1.0f - frac) + samples[index + 1] * frac;
resampledSamples.push_back(sample);
} else if (index < inputSampleCount) {
resampledSamples.push_back(samples[index]);
}
}
samples = std::move(resampledSamples);
inputSampleCount = samples.size();
}
// 第四步:转换为目标格式
QByteArray outputData;
if (outputFormat.sampleFormat() == QAudioFormat::Int16) {
outputData.resize(inputSampleCount * 2);
int16_t* intData = reinterpret_cast<int16_t*>(outputData.data());
for (int i = 0; i < inputSampleCount; i++) {
// 限制范围并转换为16位整数
float sample = std::max(-1.0f, std::min(1.0f, samples[i]));
intData[i] = static_cast<int16_t>(sample * 32767.0f);
}
} else if (outputFormat.sampleFormat() == QAudioFormat::Float) {
outputData.resize(inputSampleCount * sizeof(float));
float* floatData = reinterpret_cast<float*>(outputData.data());
for (int i = 0; i < inputSampleCount; i++) {
floatData[i] = samples[i];
}
}
//qDebug() << "音频转换完成,输出大小:" << outputData.size() << "字节";
return outputData;
}
void SpeechTestMainWindow::startKWS() {
if (isKWSActive) {
return;
}
// 检查是否正在进行其他音频操作,如果是则自动停止
if (isRecording) {
qDebug() << "KWS启动自动停止ASR麦克风识别";
stopMicRecognition();
kwsResultEdit->append("🔄 自动停止ASR麦克风识别以启动语音唤醒");
}
if (isRecordingWav) {
qDebug() << "KWS启动自动停止录音功能";
stopRecording();
kwsResultEdit->append("🔄 自动停止录音功能以启动语音唤醒");
}
qDebug() << "开始语音唤醒检测";
// 获取默认音频设备
QAudioDevice defaultDevice = QMediaDevices::defaultAudioInput();
qDebug() << "语音唤醒设备:" << defaultDevice.description();
// 使用默认配置:设备首选格式
kwsAudioFormat = defaultDevice.preferredFormat();
qDebug() << "KWS使用默认格式 - 采样率:" << kwsAudioFormat.sampleRate()
<< "声道:" << kwsAudioFormat.channelCount()
<< "格式:" << static_cast<int>(kwsAudioFormat.sampleFormat());
// 检查KWS管理器是否已初始化
if (!kwsManager->isInitialized()) {
QMessageBox::critical(this, "错误", "KWS模型未初始化请检查模型配置");
return;
}
// 创建KWS检测器
kwsSpotter = kwsManager->createKeywordSpotter();
if (!kwsSpotter) {
QMessageBox::critical(this, "错误", "无法创建KWS关键词检测器");
return;
}
// 创建KWS流
kwsStream = kwsManager->createKeywordStream(kwsSpotter);
if (!kwsStream) {
QMessageBox::critical(this, "错误", "无法创建KWS关键词流");
kwsManager->destroyKeywordSpotter(kwsSpotter);
kwsSpotter = nullptr;
return;
}
qDebug() << "KWS检测器和流创建成功";
// 创建音频源 - 优化缓冲区设置
kwsAudioSource = new QAudioSource(defaultDevice, kwsAudioFormat, this);
kwsAudioSource->setBufferSize(16384); // 增大缓冲区,减少音频丢失
kwsAudioSource->setVolume(1.0);
// 启动音频输入
kwsAudioDevice = kwsAudioSource->start();
if (!kwsAudioDevice) {
QMessageBox::critical(this, "错误", "无法启动语音唤醒音频输入");
delete kwsAudioSource;
kwsAudioSource = nullptr;
return;
}
// 创建定时器处理音频数据 - 优化处理频率
kwsTimer = new QTimer(this);
connect(kwsTimer, &QTimer::timeout, this, &SpeechTestMainWindow::processKWSData);
kwsTimer->start(30); // 30ms处理一次,更频繁的处理提高识别率
isKWSActive = true;
kwsStartBtn->setEnabled(false);
kwsStopBtn->setEnabled(true);
kwsStartBtn->setText("唤醒检测中...");
kwsResultEdit->clear();
kwsResultEdit->append("🎤 语音唤醒检测已启动");
kwsResultEdit->append("⚙️ 音频配置:默认格式 → 16kHz单声道");
// 尝试读取关键词文件
QString keywordsPath = QDir::homePath() + "/.config/QSmartAssistant/Data/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/keywords.txt";
QFile keywordsFile(keywordsPath);
kwsResultEdit->append("📋 支持的关键词:");
if (keywordsFile.open(QIODevice::ReadOnly | QIODevice::Text)) {
QTextStream in(&keywordsFile);
QString line;
int lineCount = 0;
while (!in.atEnd() && lineCount < 8) { // 显示前8个关键词
line = in.readLine().trimmed();
if (!line.isEmpty() && !line.startsWith("#")) {
kwsResultEdit->append(QString(" • %1").arg(line));
lineCount++;
}
}
keywordsFile.close();
if (lineCount == 0) {
kwsResultEdit->append(" • 小米小米");
kwsResultEdit->append(" • 小爱同学");
kwsResultEdit->append(" • 你好问问");
}
} else {
kwsResultEdit->append(" • 小米小米");
kwsResultEdit->append(" • 小爱同学");
kwsResultEdit->append(" • 你好问问");
}
kwsResultEdit->append("🎯 等待关键词检测...");
kwsResultEdit->append("⚙️ 优化配置:阈值=0.25 (提高识别率)");
kwsResultEdit->append("💡 提示:发音要清晰标准,现在更容易检测");
statusBar()->showMessage("语音唤醒检测运行中");
qDebug() << "KWS启动完成";
}
void SpeechTestMainWindow::stopKWS() {
if (!isKWSActive) {
return;
}
isKWSActive = false;
// 停止音频输入
if (kwsAudioSource) {
kwsAudioSource->stop();
delete kwsAudioSource;
kwsAudioSource = nullptr;
}
// 停止定时器
if (kwsTimer) {
kwsTimer->stop();
delete kwsTimer;
kwsTimer = nullptr;
}
// 清理KWS资源
if (kwsStream) {
kwsManager->destroyKeywordStream(kwsStream);
kwsStream = nullptr;
qDebug() << "KWS关键词流已销毁";
}
if (kwsSpotter) {
kwsManager->destroyKeywordSpotter(kwsSpotter);
kwsSpotter = nullptr;
qDebug() << "KWS关键词检测器已销毁";
}
kwsStartBtn->setEnabled(true);
kwsStopBtn->setEnabled(false);
kwsStartBtn->setText("开始语音唤醒");
kwsResultEdit->append("🛑 语音唤醒检测已停止");
kwsResultEdit->append("📊 KWS资源已清理完成");
statusBar()->showMessage("语音唤醒检测已停止");
qDebug() << "语音唤醒检测已停止,资源已清理";
}
void SpeechTestMainWindow::processKWSData() {
if (!kwsAudioDevice || !isKWSActive || !kwsStream || !kwsSpotter) {
return;
}
// 读取音频数据
QByteArray audioData = kwsAudioDevice->readAll();
if (audioData.isEmpty()) {
return;
}
// 定义目标格式:16kHz单声道
QAudioFormat targetFormat;
targetFormat.setSampleRate(16000);
targetFormat.setChannelCount(1);
targetFormat.setSampleFormat(QAudioFormat::Float);
// 转换音频格式为16kHz单声道
QByteArray convertedData = convertAudioFormat(audioData, kwsAudioFormat, targetFormat);
if (convertedData.isEmpty()) {
return;
}
// 转换后的数据是16kHz单声道浮点格式
const float* samples = reinterpret_cast<const float*>(convertedData.data());
int sampleCount = convertedData.size() / sizeof(float);
// 分块发送音频数据,提高处理效果
const int chunkSize = 1600; // 100ms的音频数据 (16000 * 0.1)
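// Decoding after each 100 ms chunk lets a keyword be reported as soon as it completes, instead of only once per timer tick over the whole buffer.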
for (int i = 0; i < sampleCount; i += chunkSize) {
int currentChunkSize = std::min(chunkSize, sampleCount - i);
kwsManager->acceptWaveform(kwsStream, samples + i, currentChunkSize);
// 每个块都检查是否准备好解码
while (kwsManager->isReady(kwsStream, kwsSpotter)) {
kwsManager->decode(kwsStream, kwsSpotter);
// 立即检查结果
QString detectedKeyword = kwsManager->getResult(kwsStream, kwsSpotter);
if (!detectedKeyword.isEmpty()) {
static int successCount = 0;
successCount++;
qDebug() << "🎯 KWS检测到关键词:" << detectedKeyword << "(第" << successCount << "次)";
kwsResultEdit->append(QString("🎯 检测到关键词: %1 (第%2次)")
.arg(detectedKeyword).arg(successCount));
statusBar()->showMessage(QString("🎯 检测到关键词: %1 (总计%2次)")
.arg(detectedKeyword).arg(successCount));
// 重置流以继续检测
kwsManager->reset(kwsStream, kwsSpotter);
return; // 检测到关键词后立即返回
}
}
}
// 简化的调试信息
static int callCount = 0;
callCount++;
if (callCount % 100 == 0) { // 减少调试输出频率
// 计算音频电平
float maxLevel = 0.0f;
for (int i = 0; i < std::min(sampleCount, 1000); i++) {
maxLevel = std::max(maxLevel, std::abs(samples[i]));
}
qDebug() << "KWS处理:" << callCount << "次,样本数:" << sampleCount
<< "电平:" << maxLevel << "阈值:0.25";
if (maxLevel > 0.02f) {
statusBar()->showMessage(QString("检测中... (电平: %1)")
.arg(maxLevel, 0, 'f', 3));
}
}
}