Description of v

0001 function [lev,xx] = v_activlevg(sp,fs,mode)
0002 %V_ACTIVLEVG Measure active speech level robustly [LEV,AF,FSO]=(sp,FS,MODE)
0003 %
0004 %Inputs: sp     is the speech signal
0005 %        FS     is the sample frequency in Hz (see also FSO below)
0006 %        MODE   is a combination of the following:
0007 %               r - raw omit input filters (default is 200 Hz to 5.5 kHz)
0008 %               0 - no high pass filter (i.e. include DC)
0009 %               4 - high pass filter at 40 Hz (but allows mains hum to pass)
0010 %               1 - use cheybyshev 1 filter
0011 %               2 - use chebyshev 2 filter (default)
0012 %               e - use elliptic filter
0013 %               h - omit low pass filter at 5.5 kHz
0014 %               d - give outputs in dB rather than power
0015 %               n - output a normalized speech signal as the first argument
0016 %               N - output a normalized filtered speech signal as the first argument
0017 %               l - give additional power level estimates (see below for details)
0018 %               a - include A-weighting filter
0019 %               i - include ITU-R-BS.468/ITU-T-J.16 weighting filter
0020 %
0021 %Outputs:
0022 %    If the "n" option is specified, a speech signal normalized to 0dB will be given as
0023 %    the first output followed by the other outputs.
0024 %        LEV    gives the speech level in units of power (or dB if mode='d')
0025 %               if mode='l' is specified, LEV is a row vector containing:
0026 %                       [ active-level mean-power mean-noise-power P.56-level harmonic-power-level]
0027 
0028 % This is an implementation of the algorithm described in [1].
0029 %
0030 % Refs:
0031 %    [1] S. Gonzalez and M. Brookes.
0032 %        Speech active level estimation in noisy conditions.
0033 %        In Proc. IEEE Intl Conf. Acoustics, Speech and Signal Processing,
0034 %        pp 6684�6688, Vancouver, May 2013. doi: 10.1109/ICASSP.2013.6638955.
0035 
0036 %      Copyright (C) Sira Gonzalez, Mike Brookes 2008-2012
0037 %      Version: $Id: v_activlevg.m 10865 2018-09-21 17:22:45Z dmb $
0038 %
0039 %   VOICEBOX is a MATLAB toolbox for speech processing.
0040 %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
0041 %
0042 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0043 %   This program is free software; you can redistribute it and/or modify
0044 %   it under the terms of the GNU General Public License as published by
0045 %   the Free Software Foundation; either version 2 of the License, or
0046 %   (at your option) any later version.
0047 %
0048 %   This program is distributed in the hope that it will be useful,
0049 %   but WITHOUT ANY WARRANTY; without even the implied warranty of
0050 %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0051 %   GNU General Public License for more details.
0052 %
0053 %   You can obtain a copy of the GNU General Public License from
0054 %   http://www.gnu.org/copyleft/gpl.html or by writing to
0055 %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
0056 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0057 persistent ro snr_ro offset c25zp c15zp e5zp
0058 if isempty(ro)
0059     ro = [0,0.124893,0.160062,0.212256,0.279864,0.351281,0.437356,...
0060         0.550314,0.680075,0.795741,0.892972,0.939087,1];  %  mapping from SNR to rho
0061     snr_ro = -2:0.5:4;
0062     offset = 0.8472; % correction factor for harmonic power in dB
0063     c25zp=[0.37843443673309i 0.23388534441447i; -0.20640255179496+0.73942185906851i -0.54036889596392+0.45698784092898i];
0064     c25zp=[[0; -0.66793268833792] c25zp conj(c25zp)];
0065     %       [c1z,c1p,c1k] = cheby1(5,0.25,1,'high','s');
0066     c15zp=[-0.659002835294875+1.195798636925079i -0.123261821596263+0.947463030958881i];
0067     c15zp=[zeros(1,5); -2.288586431066945 c15zp conj(c15zp)];
0068     %      [ez,ep,ek] = ellip(5,0.25,50,1,'high','s')
0069     e5zp=[0.406667680649209i 0.613849362744881i; -0.538736390607201+1.130245082677107i -0.092723126159100+0.958193646330194i];
0070     e5zp=[[0; -1.964538608244084]  e5zp conj(e5zp)];
0071     %    w=linspace(0.2,2,100);
0072     %    figure(1); plot(w,20*log10(abs(freqs(real(poly(c15zp(1,:))),real(poly(c15zp(2,:))),w)))); title('Chebyshev 1');
0073     %    figure(2); plot(w,20*log10(abs(freqs(real(poly(c25zp(1,:))),real(poly(c25zp(2,:))),w)))); title('Chebyshev 2');
0074     %    figure(3); plot(w,20*log10(abs(freqs(real(poly(e5zp(1,:))),real(poly(e5zp(2,:))),w)))); title('Elliptic');
0075 end
0076 if nargin<3
0077     mode=' ';
0078 end
0079 fso.ffs=fs;                           % sample frequency
0080 if any(mode=='r')                   % included for backward compatibility
0081     mode=['0h' mode];               % abolish both filters
0082 elseif fs<14000
0083     mode=['h' mode];               % abolish lowpass filter at low sample rates
0084 end
0085 % s-plane zeros and poles of high pass 5'th order filter -0.25dB at w=1 and -50dB stopband
0086 if any(mode=='1')
0087     szp=c15zp;            % Chebyshev 1
0088 elseif any(mode=='e')
0089     szp=e5zp;             % Elliptic
0090 else
0091     szp=c25zp;            % Chebyshev 2
0092 end
0093 % calculate high pass filter at 40 or 200 Hz
0094 if all(mode~='0')
0095     if any(mode=='4')
0096         fl=40;               % 40 Hz cutoff
0097     else
0098         fl=200;              % 200 Hz cutoff
0099     end
0100     zl=2./(1-szp*tan(fl*pi/fs))-1;      % 40 or 200 Hz LF limit
0101     al=real(poly(zl(2,:)));              % high pass filter
0102     bl=real(poly(zl(1,:)));
0103     sw=(-1).^(0:5)';                    %z^(-n) for z=-1
0104     bl=bl*(al*sw)/(bl*sw);           % scale to give HF gain of 1
0105 end
0106 % calculate low pass filter at 5500 Hz
0107 if all(mode~='h')
0108     zh=2./(szp/tan(5500*pi/fs)-1)+1;
0109     ah=real(poly(zh(2,:)));
0110     bh=real(poly(zh(1,:)));
0111     bh=bh*sum(ah)/sum(bh);
0112 end
0113 if any(mode=='a')
0114     [bw aw]=v_stdspectrum(2,'z',fs);
0115 elseif any(mode=='i')
0116     [bw aw]=v_stdspectrum(8,'z',fs);
0117 end
0118 % apply the input filters to the speech
0119 if all(mode~='0')
0120     sq=filter(bl,al,sp(:));     % highpass filter
0121 else
0122     sq=sp(:);
0123 end
0124 if all(mode~='h')
0125     sq=filter(bh,ah,sq(:));     % lowpass filter
0126 end
0127 if any(mode=='a') || any(mode=='i')
0128     sq=filter(bw,aw,sq(:));     % weighting filter
0129 end
0130 p56 = v_activlev(sq,fs,'0hl');            % get active level from P.56 method (no extra filters)
0131 [fx,dum,pv]=v_fxpefac(sp,fs);              % Estimate f0 and voiced probability from unfiltered speech
0132 [tz,f,S]=v_spgrambw(sq,fs,20,[0 5 fs/2],[],0.01);   % spectrogram with 20 Hz bandwidth and 5 Hz/10 ms resolution
0133 
0134 nh = 15;         % Number of harmonics to evaluate
0135 tv = find(pv>0.5);  % Find voiced frames
0136 nv=length(tv);      % number of voiced frames
0137 ran = -60:60;       % Frequency range of the mexican hat (+/- 60Hz)
0138 nm = length(ran);   % width of mexican hat
0139 o = 15;             % semi-width of central lobe
0140 mexic = (1-(ran/o).^2).*exp(-(ran.^2)/(2*o.^2)); % Mexican hat wavelet
0141 %% calculate harmonic energy in voiced frames
0142 Et=zeros(nv,1);
0143 for jv =1:nv                                    % loop for each voiced frame
0144     f_har = fx(tv(jv)).*(1:nh).';               % Calculate frequencies of harmonics
0145     lev = repmat(f_har,1,nm)+repmat(ran,nh,1);  % frequencies to test: nh x nm
0146     data = interp1(f,S(tv(jv),:),lev,'spline'); % interpolate spectrogram onto required frequencies
0147     Eh = sum(data.*repmat(mexic,nh,1),2);       % estimate power in each harmonic
0148     Et(jv) = sum(Eh(Eh>0));                     % sum the non-negative harmonic powers
0149 end
0150 har = mean(Et);                                 % mean power of voiced frames (not including offset)
0151 %% Noise estimate
0152 dp=v_estnoiseg(S,tz(2)-tz(1));                        % estimate noise power spectrum
0153 dpm=mean((f(2)-f(1))*sum(dp,2));                    % mean noise psd
0154 snr_est=10*log10(har/dpm);  % estimate global SNR (should this be restricted to voiced frames?)
0155 %% Combine methods
0156 
0157 levp56 = 10*log10(p56(1));
0158 levharm = 10*log10(har)+offset;
0159 if snr_est<=-2
0160     lev = levharm;
0161 elseif snr_est>4
0162     lev = levp56;
0163 else
0164     ro1 = interp1(snr_ro,ro,snr_est);               % interpolate the value of rho
0165     lev = ro1*levp56 + (1-ro1)*levharm;
0166 end
0167 levdb=lev;
0168 if any(mode=='l')
0169 %                       [ active-level mean-power mean-noise-power P.56-level mean-harmonic-power]
0170     lev=[levdb 10*log10(p56(2)) 10*log10(dpm) levp56 levharm];
0171 end
0172 if all(mode~='d')
0173     lev=10.^(0.1*lev);
0174 end
0175 if any(mode=='n') || any(mode=='N')
0176     xx=lev;
0177     if any(mode=='n')
0178         sq=sp;
0179     end
0180     if dpm>0
0181         lev=sq*10^(-0.05*levdb);
0182     else
0183         lev=sq;
0184     end
0185 end
v_activlevg

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SOURCE CODE