My first CNN classifier performed dismally: too many details were overlooked, and patching it up felt more frustrating than starting over, so I'm rebuilding it from scratch. Honestly, I don't know whether this attempt will work either; let's find out as we go.
First, let's be clear about what we're doing this time, starting with the imports:
# draw
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# basic handling
import os
import glob
import pickle
import numpy as np
# audio
import librosa
import librosa.display
import IPython.display
# normalization
import sklearn
# nn
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import to_categorical
from keras.callbacks import LearningRateScheduler
keras.__version__
Specify the location of the dataset.
parent_dir = '../data/UrbanSound8K/audio/'
train_dir = 'train/'
val_dir = 'val/'
test_dir = 'fold10/'
file_name = '*.wav'
train_files = glob.glob(os.path.join(parent_dir, train_dir, file_name))
val_files = glob.glob(os.path.join(parent_dir, val_dir, file_name))
test_files = glob.glob(os.path.join(parent_dir, test_dir, file_name))
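A quick sanity check that the globs actually picked up files (the exact counts depend on how the folds were split into train/ and val/):
print(len(train_files), 'train clips')
print(len(val_files), 'val clips')
print(len(test_files), 'test clips')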
Define a function for loading an audio clip. Almost all clips in the dataset are 4 s long, but some are shorter, so we zero-pad them. At a sample rate of 22050 Hz, 4 seconds corresponds to 88200 samples.
def load_clip(filename):
    x, sr = librosa.load(filename)
    # pad (and, just as a safety net, truncate) every clip to exactly 4 s = 88200 samples at 22050 Hz
    x = x[:88200]
    x = np.pad(x, (0, 88200 - x.shape[0]), 'constant')
    return x, sr
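A quick check, using the same test clip as below, that a short clip really comes back zero-padded to 88200 samples (assuming ./1.wav is at most 4 s long):
x, sr = load_clip('./1.wav')
print(x.shape, sr)   # expected: (88200,) 22050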
Define another function that extracts the clip's MFCCs and normalizes them per coefficient, so after scaling each row should have mean ≈ 0 and variance ≈ 1.
def extract_feature(filename):
    x, sr = load_clip(filename)
    mfccs = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=40)
    norm_mfccs = sklearn.preprocessing.scale(mfccs, axis=1)
    return norm_mfccs
# check that extract_feature works as expected
mfccs = extract_feature('./1.wav')
plt.figure(figsize=(20,5))
librosa.display.specshow(mfccs, sr=22050, x_axis='time', cmap='viridis')
plt.colorbar()
plt.show()
print (mfccs.var(axis=1))
print (mfccs.mean(axis=1))
Load the whole dataset, extracting features and labels from every clip.
def load_dataset(filenames):
    # 40 MFCC coefficients x 173 frames per 4 s clip (hop length 512 at 22050 Hz)
    features, labels = np.empty((0, 40, 173)), np.empty(0)
    cnt = 0
    cnt_all = len(filenames)
    for filename in filenames:
        mfccs = extract_feature(filename)
        features = np.append(features, mfccs[None], axis=0)
        cnt += 1
        if cnt % 100 == 0:
            print(str(cnt) + ' / ' + str(cnt_all) + ' finished')
        # the class ID is the second dash-separated field of the file name
        labels = np.append(labels, os.path.basename(filename).split('-')[1])
    return np.array(features), np.array(labels, dtype=int)
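For reference, UrbanSound8K file names follow the pattern [fsID]-[classID]-[occurrenceID]-[sliceID].wav, so the class label is the second dash-separated field. A quick check of the parsing above, using a representative file name:
name = os.path.basename('fold1/100032-3-0-0.wav')   # example UrbanSound8K path
print(name.split('-')[1])                            # -> '3' (dog_bark)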
A helper to visualize the training history.
def show_history(history):
    print(history.history.keys())
    fig = plt.figure(figsize=(20, 5))
    plt.subplot(121)
    plt.plot(history.history['acc'])        # 'accuracy' / 'val_accuracy' in newer Keras versions
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.subplot(122)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='lower left')
    plt.show()
If the audio has not been converted to features yet, do the conversion and cache the results to disk.
# train_x, train_y = load_dataset(train_files)
# pickle.dump(train_x, open('./train_x.dat', 'wb'))
# pickle.dump(train_y, open('./train_y.dat', 'wb'))
# val_x, val_y = load_dataset(val_files)
# pickle.dump(val_x, open('./val_x.dat', 'wb'))
# pickle.dump(val_y, open('./val_y.dat', 'wb'))
# test_x, test_y = load_dataset(test_files)
# pickle.dump(test_x, open('./test_x.dat', 'wb'))
# pickle.dump(test_y, open('./test_y.dat', 'wb'))
If the features already exist, just load them from disk.
train_x = pickle.load(open('./train_x.dat', 'rb'))
train_y = pickle.load(open('./train_y.dat', 'rb'))
val_x = pickle.load(open('./val_x.dat', 'rb'))
val_y = pickle.load(open('./val_y.dat', 'rb'))
test_x = pickle.load(open('./test_x.dat', 'rb'))
test_y = pickle.load(open('./test_y.dat', 'rb'))
Next, a little preprocessing of the features. The MFCCs are 2-D; a Conv2D layer expects a channel dimension, so reshape each example to 3-D with a single channel. Since this is a classification problem, convert the labels to one-hot categorical vectors.
train_x = train_x.reshape(train_x.shape[0],train_x.shape[1],train_x.shape[2],1)
val_x = val_x.reshape(val_x.shape[0],val_x.shape[1],val_x.shape[2],1)
test_x = test_x.reshape(test_x.shape[0],test_x.shape[1],test_x.shape[2],1)
train_y = to_categorical(train_y)
val_y = to_categorical(val_y)
test_y = to_categorical(test_y)
That's it for the pre-processing; let's sanity-check the training set.
print(train_x.shape)
print(train_y.shape)
If everything looks right, build the model.
model = Sequential()
model.add(Convolution2D(32, (3, 3), activation='relu',input_shape = train_x.shape[1:]))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.5))
model.add(Convolution2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='Adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary(line_length=80)
history = model.fit(train_x, train_y, epochs=10, batch_size=32, validation_data=(val_x, val_y))
show_history(history)
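The test fold (fold10) was loaded and preprocessed above but never actually used; a minimal sketch for evaluating the trained model on it:
test_loss, test_acc = model.evaluate(test_x, test_y, batch_size=32)
print('test loss: %.4f, test acc: %.4f' % (test_loss, test_acc))
Also, LearningRateScheduler was imported but never used. If you want to experiment with a decaying learning rate, a schedule like the one below (an arbitrary step decay, not something this notebook actually ran) can be passed to model.fit via the callbacks argument:
def step_decay(epoch):
    # start from Adam's default 1e-3 and halve every 5 epochs (arbitrary choice)
    return 1e-3 * (0.5 ** (epoch // 5))

lr_schedule = LearningRateScheduler(step_decay)
# history = model.fit(train_x, train_y, epochs=10, batch_size=32,
#                     validation_data=(val_x, val_y), callbacks=[lr_schedule])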