Label Creation for Deep-Learning VAD (Voice Activity Detection)
Deep learning has permeated almost every industry, with computer vision being the hottest area. Many audio processing algorithms are also gradually being taken over by deep learning, and VAD, as an audio pre-processing step, is very widely used. A typical traditional VAD algorithm extracts features and fits Gaussian models to obtain a speech probability for each audio segment, deciding whether it is speech or noise (including silence). Although traditional VAD can achieve decent results when its parameters are tuned well, in today's data-driven era deep learning clearly outperforms the traditional algorithms, provided the training data is sufficiently rich.
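For context, here is a minimal sketch of that classical recipe (my illustration, not from this post), assuming scikit-learn is available: fit a two-component Gaussian mixture to per-frame log-energies and call the higher-energy component speech. The function name and thresholding logic are illustrative only.

# Sketch of a classical statistical VAD: a 2-component GMM on per-frame
# log-energy, where the component with the larger mean is taken as speech.
# Assumption: scikit-learn is installed; all names here are illustrative.
import numpy as np
from sklearn.mixture import GaussianMixture

def gmm_energy_vad(frames):
    """frames: (n_frames, win_len) array. Returns a 0/1 speech flag per frame."""
    log_nrg = np.log(np.sum(frames ** 2, axis=1) + 1e-8).reshape(-1, 1)
    gmm = GaussianMixture(n_components=2, random_state=0).fit(log_nrg)
    speech_comp = np.argmax(gmm.means_)  # higher-energy component = speech
    return (gmm.predict(log_nrg) == speech_comp).astype(int)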
Below we walk through a VAD implementation from the deep-learning perspective. Implementing VAD with deep learning is not difficult; the key part is the data-preparation stage, while building the network and designing the loss function are comparatively easy, because VAD is in essence a classification problem, much like wake-word detection, except that VAD only needs two classes where wake-word detection needs several (see the classifier sketch after this list). The steps are as follows:

(1) Label creation
(2) Synthesizing noisy data
(3) Feature extraction
(4) Building the network model
(5) Model training
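To make the classification view of step (4) concrete, here is a minimal sketch of a frame-level binary classifier. This is an illustration only, not the original author's model: PyTorch, the 40-dimensional feature input, and the hidden sizes are all my assumptions.

# Hypothetical frame-level VAD classifier: one logit per frame, trained with
# binary cross-entropy (speech = 1, noise/silence = 0). Sizes are illustrative.
import torch
import torch.nn as nn

class FrameVAD(nn.Module):
    def __init__(self, feat_dim=40, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(feat_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),        # single logit: speech vs. non-speech
        )

    def forward(self, x):                # x: (batch, feat_dim)
        return self.net(x).squeeze(-1)   # logits of shape (batch,)

model = FrameVAD()
criterion = nn.BCEWithLogitsLoss()       # binary cross-entropy on raw logits
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

feats = torch.randn(32, 40)                   # dummy batch of frame features
labels = torch.randint(0, 2, (32,)).float()   # frame labels from step (1)
loss = criterion(model(feats), labels)
loss.backward()
optimizer.step()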
(1) Label creation

Hand-labeling real recordings frame by frame is too expensive, so instead we record clean data (audio containing only speech and silence, with no noise) and generate the labels automatically; on clean data, speech frames can be detected directly from their energy. The label-creation code is as follows:
audio_tools.py
import math

import numpy as np
from scipy.io import wavfile


def add_wgn(s, var=1e-4):
    """Add white Gaussian noise to the signal.
    If no variance is given, simply add jitter;
    jitter helps eliminate all-zero values."""
    np.random.seed(0)
    noise = np.random.normal(0, var, len(s))
    return s + noise


def read_wav(filename):
    """Read a wav file.
    Normalizes the signal to values between -1 and 1
    and adds some jitter to remove all-zero segments."""
    fs, s = wavfile.read(filename)  # scipy reads int samples
    s = np.array(s) / float(max(abs(s)))
    s = add_wgn(s)  # add jitter for numerical stability
    return fs, s


def enframe(x, win_len, hop_len):
    """Receive a 1D numpy array and divide it into frames.
    Outputs a numpy matrix with one frame per row."""
    x = np.squeeze(x)
    if x.ndim != 1:
        raise TypeError("enframe input must be a 1-dimensional array.")
    # int() replaces the removed np.int alias.
    n_frames = 1 + int(math.ceil((len(x) - win_len) / float(hop_len)))
    x_framed = np.zeros((n_frames, win_len))
    padlen = int((n_frames - 1) * hop_len + win_len)
    zeros = np.zeros((padlen - len(x),))
    padsignal = np.concatenate((x, zeros))
    for i in range(n_frames):
        x_framed[i] = padsignal[i * hop_len: i * hop_len + win_len]
    return x_framed


def deframe(x_framed, win_len, hop_len):
    """Interpolate 1D data with framed alignments into per-sample values.
    This helps as a visual aid and can also be used to change the frame
    rate of features, e.g. energy, zero-crossing, etc."""
    n_frames = len(x_framed)
    n_samples = n_frames * hop_len + win_len
    x_samples = np.zeros((n_samples, 1))
    for i in range(n_frames):
        x_samples[i * hop_len: i * hop_len + win_len] = x_framed[i]
    return x_samples


if __name__ == '__main__':
    pass
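Before moving on, a quick sanity check of enframe/deframe (this snippet is mine, not from the original post): at fs = 16 kHz, the 25 ms window and 10 ms hop used below correspond to 400-sample frames with a 160-sample hop.

# Illustrative usage of enframe/deframe; assumes audio_tools.py is importable.
import numpy as np
from audio_tools import enframe, deframe

fs = 16000
win_len, hop_len = int(fs * 0.025), int(fs * 0.010)  # 400 and 160 samples
x = np.random.randn(fs)                              # 1 s of dummy audio
frames = enframe(x, win_len, hop_len)
print(frames.shape)                                  # -> (99, 400)
# deframe stretches one value per frame back out to per-sample values:
flags = (np.mean(frames ** 2, axis=1, keepdims=True) > 0.1).astype(float)
per_sample = deframe(flags, win_len, hop_len)
print(per_sample.shape)                              # -> (16240, 1)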
unsupervised_vad.py

#!/usr/bin/python
# Voice Activity Detection (VAD) tool.
# Use the vad_help() function for instructions.
# Navid Shokouhi, December 2012.
# Updated May 2017 for a speaker-recognition collaboration.

import os

import numpy as np

from audio_tools import *


## Function definitions:
def vad_help():
    """Voice Activity Detection (VAD) tool.
    Navid Shokouhi, May 2017."""
    print("Usage:")
    print("python unsupervised_vad.py")


#### Display tools
def plot_this(s, title=''):
    import pylab
    s = s.squeeze()
    if s.ndim == 1:
        pylab.plot(s)
    else:
        pylab.imshow(s, aspect='auto')
    pylab.title(title)
    pylab.show()


def plot_these(s1, s2):
    import pylab
    try:
        # If values are numpy arrays
        pylab.plot(s1 / max(abs(s1)), color='red')
        pylab.plot(s2 / max(abs(s2)), color='blue')
    except TypeError:
        # Values are lists
        pylab.plot(s1, color='red')
        pylab.plot(s2, color='blue')
    pylab.legend()
    pylab.show()


def plot_these1(s1, s2):
    import matplotlib.pyplot as plt
    plt.figure(figsize=(16, 9))
    try:
        # If values are numpy arrays
        plt.plot(s1 / max(abs(s1)), color='red')
        plt.plot(s2 / max(abs(s2)), color='blue')
    except TypeError:
        # Values are lists
        plt.plot(s1, color='red')
        plt.plot(s2, color='blue')
    plt.legend()
    plt.show()


#### Energy tools
def zero_mean(xframes):
    """Remove the mean of each frame; return zero-mean frames."""
    m = np.mean(xframes, axis=1)
    xframes = xframes - np.tile(m, (xframes.shape[1], 1)).T
    return xframes


def compute_nrg(xframes):
    # Per-frame energy, normalized by the frame length.
    win_len = xframes.shape[1]
    return np.diagonal(np.dot(xframes, xframes.T)) / float(win_len)


def compute_log_nrg(xframes):
    # Per-frame log energy, mean/variance normalized.
    win_len = xframes.shape[1]
    raw_nrgs = np.log(compute_nrg(xframes + 1e-5)) / float(win_len)
    return (raw_nrgs - np.mean(raw_nrgs)) / (np.sqrt(np.var(raw_nrgs)))


def power_spectrum(xframes):
    """xframes: input signal, each row is one frame."""
    X = np.fft.fft(xframes, axis=1)
    X = np.abs(X[:, :X.shape[1] // 2]) ** 2  # integer division for Python 3
    return np.sqrt(X)


def nrg_vad(xframes, percent_thr, nrg_thr=0., context=5):
    """Picks frames with high energy as determined by a user-defined threshold.

    This function also uses a 'context' parameter to resolve the fluctuating
    nature of thresholding. context is an integer value determining the number
    of neighboring frames that should be used to decide if a frame is voiced.

    The log-energy values are subject to mean and variance normalization to
    simplify picking the right threshold. In this framework, the default
    threshold is 0.0."""
    xframes = zero_mean(xframes)
    n_frames = xframes.shape[0]
    # Compute per-frame energies:
    xnrgs = compute_log_nrg(xframes)
    xvad = np.zeros((n_frames, 1))
    for i in range(n_frames):
        start = max(i - context, 0)
        end = min(i + context, n_frames - 1)
        n_above_thr = np.sum(xnrgs[start:end] > nrg_thr)
        n_total = end - start + 1
        xvad[i] = 1. * ((float(n_above_thr) / n_total) > percent_thr)
    return xvad


def read_audio_file1(path, fmt, flag=0):
    # Recursively collect all files under `path` whose names end with `fmt`.
    files = []
    names = []
    for root, dirs, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith(fmt):
                file_path = root + '/' + filename
                files.append(file_path)
                filename = filename.split('.')[0]
                if flag == 1:
                    # Build a unique name from the last three path components.
                    name = file_path.split('.')[0]
                    name = name.split('/')
                    filename = name[-3] + '_' + name[-2] + '_' + name[-1]
                names.append(filename)
    return files, names


def max_filter(vads):
    # Majority-vote smoothing over a sliding window of 20 frames: a frame is
    # voiced if more than half of its neighbors are voiced. Returns both a
    # column vector (for plotting) and a plain int list (for writing labels).
    hist = 0
    win_len = 20
    half_len = int(win_len / 2)
    vad_len = len(vads)
    new_vads = []
    wri_vads = []
    for i, vad in enumerate(vads):
        if i < win_len:
            new_vads.append(float(vad))
            wri_vads.append(int(vad))
            continue
        if i < vad_len - half_len:
            for j in range(i - half_len, i + half_len):
                hist = hist + vads[j]
            if hist > half_len:
                new_vads.append(1.)
                wri_vads.append(1)
            else:
                new_vads.append(0.)
                wri_vads.append(0)
        else:
            new_vads.append(float(vad))
            wri_vads.append(int(vad))
        hist = 0
    new_vads = np.array(new_vads)
    new_vads = new_vads.reshape(len(new_vads), 1)
    return new_vads, wri_vads


def get_start_end_pts(new_vad):
    # Extract (start, end) frame indices of each voiced segment.
    starts = []
    ends = []
    flags = 1
    for i, vad in enumerate(new_vad):
        if int(new_vad[0]) == 1 and flags == 1:
            starts.append(0)
            flags = 0
        if int(new_vad[i]) == 0 and int(new_vad[i + 1]) == 1:
            starts.append(i)
        elif int(new_vad[i]) == 1 and int(new_vad[i + 1]) == 0:
            ends.append(i)
        if i == len(new_vad) - 2:
            break
    if int(new_vad[-2]) == 1 and int(new_vad[-1]) == 1:
        ends.append(len(new_vad) - 1)
    return starts, ends


def write_startEnd_info(filename, starts, ends):
    assert len(starts) == len(ends)
    f = open(filename, 'w')
    for start, end in zip(starts, ends):
        if start > end:
            print('==========start end info err========')
        f.write(str(start) + ' ' + str(end) + '\n')
    f.close()


def write_frame_labels(fp, vads, name):
    # One utterance per line: "name:0 1 1 0 ... "
    fp.write(name + ':')
    for v in vads:
        fp.write(str(v) + ' ')
    fp.write('\n')
    return fp


if __name__ == '__main__':
    fs = 16000
    win_len = int(fs * 0.025)   # 25 ms frames
    hop_len = int(fs * 0.010)   # 10 ms hop
    files, names = read_audio_file1('../data/clean', '.wav')
    label_p = '../data/speech_labels.txt'
    fp = open(label_p, 'w')
    cnt = 0
    ratio = 0
    acc_sum = 0
    pos_sum = 0
    for file, name in zip(files, names):
        fs, s = read_wav(file)
        print('cnt = ', cnt)
        cnt = cnt + 1
        sframes = enframe(s, win_len, hop_len)  # rows: frame index, cols: each frame
        # plot_this(compute_log_nrg(sframes))
        # percent_high_nrg is the VAD context ratio. It helps smooth the
        # output VAD decisions. Higher values are more strict.
        percent_high_nrg = 0.05
        vad = nrg_vad(sframes, percent_high_nrg)
        new_vad, wri_vad = max_filter(vad)
        plot_these1(deframe(new_vad, win_len, hop_len), s)
        acc_sum = acc_sum + len(wri_vad)
        pos_sum = pos_sum + np.sum(wri_vad)
        # Compute the fraction of speech frames.
        if cnt % 100 == 0:
            ratio = pos_sum / acc_sum
            print('cnt = %d, acc_sum = %d, pos_sum = %d, ratio = %f' % (cnt, acc_sum, pos_sum, ratio))
        fp = write_frame_labels(fp, wri_vad, name)
    fp.close()
    print('ratio = ', ratio)

Running this on a single audio clip gives the following result:

[Figure: smoothed VAD decisions (red) overlaid on the normalized waveform (blue)]
As the figure above shows, the labels produced by this method are quite accurate.
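The file written by write_frame_labels has one utterance per line in the form name:0 1 1 0 .... Here is a small sketch (mine, not from the post) for loading it back when assembling training data:

# Parse speech_labels.txt into a dict mapping utterance name -> label array.
# The format follows write_frame_labels above: "name:0 1 1 0 ... ".
import numpy as np

def load_frame_labels(path='../data/speech_labels.txt'):
    labels = {}
    with open(path) as f:
        for line in f:
            name, vals = line.rstrip('\n').split(':', 1)
            labels[name] = np.array(vals.split(), dtype=np.int64)
    return labels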
Summary

Energy-based thresholding on clean recordings, smoothed with a sliding majority filter, produces frame-level speech/non-speech labels at almost no labeling cost; with these labels in hand, the remaining steps are noise synthesis, feature extraction, and training the classification network.