1
0
mirror of https://github.com/f4exb/sdrangel.git synced 2026-04-07 07:35:38 -04:00

Freq Scanner voice squelch: revamping of voice detection

This commit is contained in:
f4exb 2026-02-17 00:23:43 +01:00
parent 642a6da8aa
commit e782848642
3 changed files with 313 additions and 219 deletions

View File

@ -484,13 +484,15 @@ void FreqScanner::processScanResults(const QDateTime& fftStartTime, const QList<
// Find first frequency in list above threshold
for (int i = 0; i < m_scanResults.size(); i++)
{
frequencySettings = m_settings.getFrequencySettings(m_scanResults[i].m_frequency);
int j = m_settings.m_voiceSquelchType == FreqScannerSettings::VoiceSquelchType::VoiceLsb ?
m_scanResults.size()-1 - i : i;
frequencySettings = m_settings.getFrequencySettings(m_scanResults[j].m_frequency);
Real threshold = m_settings.getThreshold(frequencySettings);
if ((m_scanResults[i].m_power >= threshold)
&& checkVoiceThreshold(m_settings.m_voiceSquelchType, m_scanResults[i].m_voiceActivityLevel, m_settings.m_voiceSquelchThreshold))
if ((m_scanResults[j].m_power >= threshold)
&& checkVoiceThreshold(m_settings.m_voiceSquelchType, m_scanResults[j].m_voiceActivityLevel, m_settings.m_voiceSquelchThreshold))
{
frequency = m_scanResults[i].m_frequency;
frequency = m_scanResults[j].m_frequency;
activeFrequencySettings = frequencySettings;
break;
}

View File

@ -18,6 +18,11 @@
#include <QDebug>
#include <complex.h>
#include <cmath>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#include "dsp/dspengine.h"
#include "dsp/fftfactory.h"
@ -113,7 +118,17 @@ void FreqScannerSink::processOneSample(Complex &ci)
qint64 startFrequency = m_centerFrequency - m_scannerSampleRate / 2;
qint64 diff = frequency - startFrequency;
float binBW = m_scannerSampleRate / (float)m_fftSize;
// qDebug() << "FreqScannerSink::processOneSample:"
// << "startFrequency" << startFrequency
// << "m_scannerSampleRate" << m_scannerSampleRate
// << "m_centerFrequency" << m_centerFrequency
// << "m_fftSize" << m_fftSize
// << "frequency" << frequency
// << "diff" << diff
// << "binBW" << binBW
// << "m_binsPerChannel" << m_binsPerChannel;
// avoid spectrum edges where there may be aliasing from half-band filters
if ((diff >= m_scannerSampleRate / 8) && (diff < m_scannerSampleRate * 7 / 8))
{
int bin = std::round(diff / binBW);
@ -127,9 +142,9 @@ void FreqScannerSink::processOneSample(Complex &ci)
Real voiceLevel = 0.0;
if (m_settings.m_voiceSquelchType == FreqScannerSettings::VoiceLsb) {
voiceLevel = voiceActivityLevel(bin, channelBins, true);
voiceLevel = voiceActivityLevel(frequency, bin, channelBins, true);
} else if (m_settings.m_voiceSquelchType == FreqScannerSettings::VoiceUsb) {
voiceLevel = voiceActivityLevel(bin, channelBins, false);
voiceLevel = voiceActivityLevel(frequency, bin, channelBins, false);
}
if (voiceLevel > 0.0) {
@ -282,6 +297,83 @@ Real FreqScannerSink::magSqFromRawFFT(int bin) const
return magSq(fftBin);
}
// Compute formant envelope using log-domain spectral smoothing
// This separates vocal tract resonances (formants) from pitch harmonics
void FreqScannerSink::getFormantEnvelope(int startBin, int endBin, QVector<Real>& envelope) const
{
if (startBin < 0 || endBin >= m_fftSize || startBin > endBin) {
envelope.clear();
return;
}
int numBins = endBin - startBin + 1;
envelope.resize(numBins);
// Step 1: Compute log magnitude spectrum
QVector<Real> logMag(numBins);
Real minLog = -10.0; // Floor to avoid log(0)
for (int i = 0; i < numBins; i++)
{
Real magSq = magSqFromRawFFT(startBin + i);
Real mag = std::sqrt(std::max(magSq, (Real)1e-12));
logMag[i] = std::log(mag);
if (logMag[i] < minLog) {
logMag[i] = minLog;
}
}
// Step 2: Apply smoothing in log domain to extract formant envelope
// This removes rapid variations (pitch harmonics) while keeping slow variations (formants)
// Use moving average filter with window size based on expected harmonic spacing
float binBW = m_scannerSampleRate / (float)m_fftSize;
// For formant detection, smooth over approximately 200-300 Hz
// This is wider than typical harmonic spacing (80-300 Hz) but narrower than formant bandwidth
float smoothingBandwidthHz = 250.0; // Hz
int smoothingWindow = std::max(3, (int)(smoothingBandwidthHz / binBW));
// Make smoothing window odd for symmetry
if (smoothingWindow % 2 == 0) {
smoothingWindow++;
}
int halfWindow = smoothingWindow / 2;
// qDebug() << "FreqScannerSink::getFormantEnvelope smoothingWindow:" << smoothingWindow
// << "bins (" << (smoothingWindow * binBW) << "Hz)";
// Apply moving average smoothing
QVector<Real> smoothedLog(numBins);
for (int i = 0; i < numBins; i++)
{
Real sum = 0.0;
int count = 0;
for (int j = -halfWindow; j <= halfWindow; j++)
{
int idx = i + j;
if (idx >= 0 && idx < numBins) {
sum += logMag[idx];
count++;
}
}
smoothedLog[i] = (count > 0) ? (sum / count) : logMag[i];
}
// Step 3: Convert back to linear magnitude
for (int i = 0; i < numBins; i++)
{
envelope[i] = std::exp(smoothedLog[i]);
}
// Debug: print first few envelope values
// qDebug() << "FreqScannerSink::getFormantEnvelope first 10 envelope values:"
// << envelope.mid(0, std::min(10, numBins));
}
void FreqScannerSink::applyChannelSettings(int channelSampleRate, int channelFrequencyOffset, int scannerSampleRate, int fftSize, int binsPerChannel, bool force)
{
qDebug() << "FreqScannerSink::applyChannelSettings:"
@ -354,12 +446,12 @@ void FreqScannerSink::applySettings(const FreqScannerSettings& settings, const Q
}
// Voice activity detection for SSB signals
// Detects voice by looking for formant-like structure (broad spectral peaks)
// Detects voice by looking for formant-like structure using spectral smoothing
// Returns a value from 0.0 (no voice) to 1.0 (strong voice signature)
Real FreqScannerSink::voiceActivityLevel(int bin, int channelBins, bool isLSB) const
Real FreqScannerSink::voiceActivityLevel(qint64 freq, int bin, int channelBins, bool isLSB) const
{
// Voice band in SSB is typically 100-3000 Hz from carrier
// We look for 2-4 formant peaks with bandwidth 50-200 Hz each
// We look for 2-4 formant peaks in the smoothed spectral envelope
int startBin = bin - channelBins / 2 + 1;
int endBin = startBin + channelBins - 1;
@ -372,244 +464,243 @@ Real FreqScannerSink::voiceActivityLevel(int bin, int channelBins, bool isLSB) c
float binBW = m_scannerSampleRate / (float)m_fftSize;
// For LSB, spectrum is reversed - flip the search direction
int step = isLSB ? -1 : 1;
int searchStart = isLSB ? endBin : startBin;
int searchEnd = isLSB ? startBin : endBin;
int carrierBin = isLSB ? endBin : startBin;
// Find peaks above noise floor
QVector<int> peakBins;
QVector<int> broadPeakBins;
QVector<Real> peakMags;
// Get formant envelope using spectral smoothing
// This separates vocal tract resonances from pitch harmonics
QVector<Real> formantEnvelope;
getFormantEnvelope(startBin, endBin, formantEnvelope);
// Calculate average noise floor from raw FFT
if (formantEnvelope.isEmpty()) {
qDebug() << "FreqScannerSink::voiceActivityLevel formantEnvelope is empty!";
return 0.0;
}
// Calculate noise floor from formant envelope
Real noiseFloor = 0.0;
int noiseCount = 0;
for (int i = startBin; i <= endBin; i++) {
noiseFloor += magSqFromRawFFT(i);
noiseCount++;
Real maxEnv = 0.0;
for (int i = 0; i < formantEnvelope.size(); i++) {
noiseFloor += formantEnvelope[i];
maxEnv = std::max(maxEnv, formantEnvelope[i]);
}
noiseFloor = (noiseCount > 0) ? (noiseFloor / noiseCount) : 1e-12;
Real threshold = noiseFloor * 3.0; // 4.77 dB above noise
noiseFloor = formantEnvelope.size() > 0 ? noiseFloor / formantEnvelope.size() : 1e-12;
// For voice, we need reasonably high peaks relative to noise
// Use 4 dB (2.5x) above average as threshold for peak detection
// Additional validation: peak-to-noise ratio
// If max is not sufficiently higher than average, signal quality is poor
float peakToNoiseRatio = maxEnv / noiseFloor;
// qDebug() << "FreqScannerSink::voiceActivityLevel: noiseFloor:" << noiseFloor
// << "maxEnv:" << maxEnv << "peak-to-noise ratio:" << peakToNoiseRatio;
// Lower threshold: require at least 1.2x peak-to-noise ratio (only ~1.6dB)
// After heavy smoothing, formants appear as gentle bumps, not sharp peaks
if (peakToNoiseRatio < 1.2) {
// Signal is essentially all noise
// qDebug() << "FreqScannerSink::voiceActivityLevel: Rejected - insufficient signal (peak-to-noise:" << peakToNoiseRatio << ")";
return 0.0;
}
// For peak detection, use much lower threshold since smoothed spectral peaks are gentle
// Use 1.1x noise floor to catch formant peaks in the smoothed envelope
Real threshold = noiseFloor * 1.1;
// Simple peak detection using raw FFT data
int i = searchStart;
while ((isLSB && i >= searchEnd) || (!isLSB && i <= searchEnd))
// Find formant peaks in the smoothed envelope
QVector<int> formantBins;
QVector<Real> formantMags;
// Simple peak detection in formant envelope
for (int i = 1; i < formantEnvelope.size() - 1; i++)
{
Real binMagSq = magSqFromRawFFT(i);
if (binMagSq > threshold)
{
// Found potential peak start
Real peakMag = binMagSq;
int peakBin = i;
Real prev = formantEnvelope[i - 1];
Real curr = formantEnvelope[i];
Real next = formantEnvelope[i + 1];
// Find local maximum
i += step;
while ((isLSB && i >= searchEnd) || (!isLSB && i <= searchEnd))
// Local maximum above threshold
if (curr > prev && curr > next && curr > threshold)
{
int absBin = startBin + i;
float freqOffset = std::abs(absBin - carrierBin) * binBW;
// Only include peaks within SSB voice bandwidth (100-3000 Hz from carrier)
if (freqOffset >= 100.0 && freqOffset <= 3000.0)
{
Real nextMagSq = magSqFromRawFFT(i);
if (nextMagSq > peakMag) {
peakMag = nextMagSq;
peakBin = i;
i += step;
} else if (nextMagSq > threshold) {
i += step;
} else {
break; // Peak ended
}
}
// Calculate frequency offset from carrier
// For USB: carrier is at startBin, voice extends upward
// For LSB: carrier is at endBin, voice extends downward
int carrierBin = isLSB ? endBin : startBin;
float freqOffset = std::abs(peakBin - carrierBin) * binBW;
// Only include peaks within SSB voice bandwidth (0-3000 Hz from carrier)
if (freqOffset <= 3000.0) {
peakBins.append(peakBin);
peakMags.append(peakMag);
formantBins.append(absBin);
formantMags.append(curr);
}
}
i += step;
}
if (peakBins.size() < 2) {
return 0.0; // Need at least 2 peaks for voice
// Voice requires 2-4 formants
if (formantBins.size() < 2) {
// qDebug() << "FreqScannerSink::voiceActivityLevel: Not enough peaks detected" << formantBins.size();
return 0.0;
}
// Measure peak bandwidths
int broadPeakCount = 0;
for (int p = 0; p < peakBins.size(); p++)
// qDebug() << "FreqScannerSink::voiceActivityLevel: Detected" << formantBins.size() << "peaks above threshold";
// Sort formants by frequency offset (not bin order) to handle LSB reversal
QVector<float> formantFreqs;
QVector<int> formantIndices;
for (int i = 0; i < formantBins.size(); i++)
{
int peakBin = peakBins[p];
Real peakMag = peakMags[p];
Real halfPower = peakMag * 0.5; // 3dB point
// Measure bandwidth at half power (-3dB)
int bwCount = 1; // Peak bin itself
// Search left
for (int j = peakBin - 1; j >= startBin; j--) {
if (magSqFromRawFFT(j) > halfPower) {
bwCount++;
} else {
break;
}
}
// Search right
for (int j = peakBin + 1; j <= endBin; j++) {
if (magSqFromRawFFT(j) > halfPower) {
bwCount++;
} else {
break;
}
}
float bandwidth = bwCount * binBW;
// Voice formants are typically 50-200 Hz wide
// CW signals are <50 Hz wide
if (bandwidth >= 50.0 && bandwidth <= 200.0)
float freqOffset = std::abs(formantBins[i] - carrierBin) * binBW;
formantFreqs.append(freqOffset);
formantIndices.append(i);
}
// Simple insertion sort by frequency
for (int i = 1; i < formantFreqs.size(); i++)
{
for (int j = i; j > 0 && formantFreqs[j] < formantFreqs[j - 1]; j--)
{
broadPeakBins.append(peakBin);
broadPeakCount++;
std::swap(formantFreqs[j], formantFreqs[j - 1]);
std::swap(formantIndices[j], formantIndices[j - 1]);
}
}
// Merge peaks that are too close together (within 400 Hz)
// Real formants have minimum separation of 400-500 Hz in SSB voice
// So anything closer is ripple within a single formant
// We keep the peak with highest magnitude and remove others
QVector<float> mergedFormantFreqs;
QVector<int> mergedFormantIndices;
const float minFormantSpacing = 400.0; // Hz - minimum spacing between real formants
for (int i = 0; i < formantFreqs.size(); i++)
{
if (i == 0 || formantFreqs[i] - mergedFormantFreqs.back() >= minFormantSpacing)
{
// This is a new formant (far enough from previous)
mergedFormantFreqs.append(formantFreqs[i]);
mergedFormantIndices.append(formantIndices[i]);
}
else
{
// This peak is too close to the previous one - merge by keeping highest magnitude
int prevIdx = mergedFormantIndices.back();
int currIdx = formantIndices[i];
if (formantMags[currIdx] > formantMags[prevIdx])
{
// Replace with higher magnitude peak
mergedFormantFreqs.back() = formantFreqs[i];
mergedFormantIndices.back() = currIdx;
}
}
}
formantFreqs = mergedFormantFreqs;
formantIndices = mergedFormantIndices;
// qDebug() << "FreqScannerSink::voiceActivityLevel: Merged to" << formantFreqs.size() << "formants:" << formantFreqs;
// After merging, we should still have at least 2 formants for voice
if (formantFreqs.size() < 2) {
// qDebug() << "FreqScannerSink::voiceActivityLevel: Not enough formants after merging";
return 0.0;
}
// Check formant spacing (voice formants are typically 500-1500 Hz apart)
// Check formant spacing (voice formants should be 400-1500 Hz apart)
// F1-F2 spacing is typically 600-1200 Hz
bool goodSpacing = false;
if (broadPeakCount >= 2)
for (int i = 0; i < formantFreqs.size() - 1; i++)
{
for (int p = 0; p < broadPeakBins.size() - 1; p++)
{
int spacing = std::abs(broadPeakBins[p + 1] - broadPeakBins[p]);
float spacingHz = spacing * binBW;
if (spacingHz >= 400.0 && spacingHz <= 1100.0) {
goodSpacing = true;
break;
}
float spacing = formantFreqs[i + 1] - formantFreqs[i];
if (spacing >= 400.0 && spacing <= 1500.0) {
goodSpacing = true;
break;
}
}
// Calculate voice activity score
// 2-4 broad peaks with good spacing = strong voice signature
if (!goodSpacing) {
// qDebug() << "FreqScannerSink::voiceActivityLevel: Bad formant spacing" << formantFreqs;
return 0.0; // Formants too close or too far apart
}
// Check for F1 formant in expected range (300-1000 Hz from carrier)
// This is critical for voice detection
// F1 should be the lowest frequency formant
bool hasF1 = false;
Real f1Mag = 0.0;
int f1Idx = -1;
if (formantFreqs.size() > 0 && formantFreqs[0] >= 300.0 && formantFreqs[0] <= 1000.0)
{
hasF1 = true;
f1Idx = formantIndices[0];
f1Mag = formantMags[f1Idx];
}
if (!hasF1) {
// qDebug() << "FreqScannerSink::voiceActivityLevel: No valid F1 formant, lowest freq:" << (formantFreqs.size() > 0 ? formantFreqs[0] : -1);
return 0.0; // No F1 formant - not voice or mistuned
}
// Check for F2 formant in expected range (900-2500 Hz from carrier)
// F2 should be higher frequency than F1 AND 400-1500 Hz away from F1
bool hasF2 = false;
Real f2Mag = 0.0;
int f2Idx = -1;
float f2Freq = 0.0;
for (int i = 1; i < formantFreqs.size(); i++)
{
float freqOffset = formantFreqs[i];
float f1ToF2Spacing = freqOffset - formantFreqs[0];
// F2 must be: in range, higher than F1, and properly spaced from F1
if (freqOffset >= 900.0 && freqOffset <= 2500.0 &&
f1ToF2Spacing >= 400.0 && f1ToF2Spacing <= 1500.0)
{
hasF2 = true;
f2Idx = formantIndices[i];
f2Freq = freqOffset;
f2Mag = formantMags[f2Idx];
break; // Take the first valid F2
}
}
if (!hasF2) {
// qDebug() << "FreqScannerSink::voiceActivityLevel: No valid F2 formant, candidates not in range/spacing";
return 0.0; // No F2 formant - not voice or mistuned
}
// Calculate voice activity score based on formant characteristics
// Voice is indicated by presence of F1 and F2 formants - this is the primary voice signature
// Harmonics are less reliable in SSB due to spectral properties and noise
float score = 0.0;
if (broadPeakCount >= 2)
{
// Detect fundamental frequency (f0) by looking for harmonic structure
// Voice has harmonics at f0, 2*f0, 3*f0, etc. with formants modulating them
// Typical f0: male 85-180 Hz, female 165-255 Hz
int carrierBin = isLSB ? endBin : startBin;
// Base score from number of formants (2-4 formants typical for voice)
// Formants are the most reliable voice indicator
// Use merged formant count, not raw peak count
float formantScore = std::min(formantFreqs.size() / 3.0f, 1.0f);
score += formantScore * 0.6; // 60% weight
// Try different f0 candidates in typical voice range (80-300 Hz)
int maxHarmonics = 0;
int bestF0Hz = 0;
// Score from formant magnitude (strong formants = strong voice)
// Higher magnitudes indicate clearer voice detection
// Make this threshold appropriately high to prefer strong signals
float magnitudeScore = std::min((float)(f1Mag + f2Mag) / (float)(noiseFloor * 6.0), 1.0f);
score += magnitudeScore * 0.4; // 40% weight
for (int f0Hz = 80; f0Hz <= 300; f0Hz += 10)
{
int f0Bins = (int)(f0Hz / binBW);
int harmonicCount = 0;
// Clamp to [0, 1]
score = std::max(0.0f, std::min(score, 1.0f));
// Check for harmonics up to 3000 Hz (SSB bandwidth limit)
for (int h = 1; h <= 10; h++)
{
int harmonicBin = carrierBin + (isLSB ? -1 : 1) * (h * f0Bins);
// Check if any detected peak is near this harmonic (within ±30 Hz tolerance)
int tolerance = (int)(30.0 / binBW);
for (int p = 0; p < peakBins.size(); p++)
{
if (std::abs(peakBins[p] - harmonicBin) <= tolerance)
{
harmonicCount++;
break;
}
}
// Stop checking beyond 3 kHz
if (h * f0Hz > 3000) {
break;
}
}
if (harmonicCount > maxHarmonics)
{
maxHarmonics = harmonicCount;
bestF0Hz = f0Hz;
}
}
// Need at least 3 harmonics aligned to confirm voice pitch structure
// If mistuned by 1 kHz, formants won't align with any harmonic series
if (maxHarmonics < 3) {
return 0.0;
}
// Check if first harmonic (f0) is actually present at low frequency
// When tuned correctly, f0 should be 80-300 Hz from carrier
int f0Bins = (int)(bestF0Hz / binBW);
int firstHarmonicBin = carrierBin + (isLSB ? -1 : 1) * f0Bins;
int tolerance = (int)(30.0 / binBW);
bool hasF0 = false;
for (int p = 0; p < peakBins.size(); p++)
{
if (std::abs(peakBins[p] - firstHarmonicBin) <= tolerance)
{
hasF0 = true;
break;
}
}
// If no peak near fundamental frequency, signal is mistuned
// (harmonics may still align but they're higher harmonics of wrong pitch)
if (!hasF0) {
return 0.0;
}
// Verify at least one broad formant in F1 range (300-1000 Hz from carrier)
// This catches low-tuning errors where harmonics might still align
bool hasF1 = false;
for (int p = 0; p < broadPeakBins.size(); p++)
{
float freqOffset = std::abs(broadPeakBins[p] - carrierBin) * binBW;
if (freqOffset >= 300.0 && freqOffset <= 1000.0) {
hasF1 = true;
break;
}
}
if (!hasF1) {
return 0.0; // No F1 formant - signal is mistuned
}
// Base score from number of broad peaks
score = std::min(broadPeakCount / 4.0f, 1.0f);
// Boost if spacing is good
if (goodSpacing) {
score = std::min(score * 1.5f, 1.0f);
}
// Boost if strong harmonic structure (4+ harmonics)
if (maxHarmonics >= 4) {
score = std::min(score * 1.2f, 1.0f);
}
// Penalize if too many narrow peaks (likely CW or noise)
// => This condition is ALWAYS true as there are always many more peaks than broad peaks
// int narrowPeakCount = peakBins.size() - broadPeakCount;
// if (narrowPeakCount > broadPeakCount) {
// score *= 0.5;
// }
// Just strongly penalize if there are no broad peaks at all
if (broadPeakCount == 0) {
score *= 0.1;
}
if (score > m_settings.m_voiceSquelchThreshold) {
// qDebug() << "FreqScannerSink::voiceActivityLevel Sorted formant frequencies:" << formantFreqs;
qDebug() << "FreqScannerSink::voiceActivityLevel: Voice activity detection (formant-based):"
<< "freq:" << freq << "Hz"
<< "formants:" << formantFreqs.size() << "(merged from" << formantBins.size() << "peaks)"
<< "F1:" << (hasF1 ? QString::number(formantFreqs[0], 'f', 0) : "none") << "Hz"
<< "F2:" << (hasF2 ? QString::number(f2Freq, 'f', 0) : "none") << "Hz"
<< "spacing:" << (hasF2 ? QString::number(f2Freq - formantFreqs[0], 'f', 0) : "none") << "Hz"
<< "peak-to-noise:" << (maxEnv / noiseFloor)
<< "noiseFloor:" << noiseFloor << "threshold:" << threshold
<< "f1Mag:" << f1Mag << "f2Mag:" << f2Mag
<< "score:" << score;
}
return score;
}

View File

@ -83,7 +83,8 @@ private:
Real peakPower(int bin, int channelBins) const;
Real magSq(int bin) const;
Real magSqFromRawFFT(int bin) const;
Real voiceActivityLevel(int bin, int channelBins, bool isLSB) const;
Real voiceActivityLevel(qint64 freq, int bin, int channelBins, bool isLSB) const;
void getFormantEnvelope(int startBin, int endBin, QVector<Real>& envelope) const;
};
#endif // INCLUDE_FREQSCANNERSINK_H