Freesound General-Purpose Audio Tagging Challenge
Freesound is a collaborative database of Creative Commons licensed sounds. The goal of this competition is to classify audio files containing real-world sounds such as musical instruments, humans, animals, and machines. Example labels include trumpet, squeak, meow, applause, and finger snapping. One of the challenges is that not all labels are manually verified, so a creative solution should be able to rely, at least in part, on these weak annotations.
This article walks through how to visualize the data and build models for the task.
# Change this to True to reproduce the results
COMPLETE_RUN = False
import numpy as np
np.random.seed(1001)
import os
import shutil
import IPython
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold
%matplotlib inline
matplotlib.style.use('ggplot')
train = pd.read_csv("../freesound-audio-tagging/train.csv")
test = pd.read_csv("../freesound-audio-tagging/sample_submission.csv")
train.head()
train.label.unique()
category_group = train.groupby(['label', 'manually_verified']).count()
plot = category_group.unstack().reindex(category_group.unstack().sum(axis=1).sort_values().index)\
.plot(kind='bar', stacked=True, title="Number of audio samples per category", figsize=(16,10))
plot.set_xlabel("Category")
plot.set_ylabel("Number of samples")
print('Minimum number of samples per category = ', min(train.label.value_counts()))
print('Maximum number of samples per category = ', max(train.label.value_counts()))
We find:
The minimum number of samples in a category is 94, and the maximum is 300.
The proportion of manually_verified labels is not uniform across categories.
import IPython.display as ipd  # To play sound in the notebook
fname = '../freesound-audio-tagging/audio_train/' + '00044347.wav' # Hi-hat
ipd.Audio(fname)
# Using the wave library
import wave
wav = wave.open(fname)
print("Sampling (frame) rate = ", wav.getframerate())
print("Total samples (frames) = ", wav.getnframes())
print("Duration = ", wav.getnframes()/wav.getframerate())
# Using scipy
from scipy.io import wavfile
rate, data = wavfile.read(fname)
print("Sampling (frame) rate = ", rate)
print("Total samples (frames) = ", data.shape)
print(data)
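As an aside (not part of the original notebook), scipy returns the raw integer PCM samples; if you want them as floats in [-1, 1], a minimal conversion, assuming 16-bit PCM (check data.dtype first), looks like this:
data_float = data.astype(np.float32) / 32768.0   # scale the int16 range to [-1, 1]
print(data_float.min(), data_float.max())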
Let's plot the audio frames.
plt.plot(data, '-', );
Let's zoom in on the first 500 frames.
plt.figure(figsize=(16, 4))
plt.plot(data[:500], '.'); plt.plot(data[:500], '-');
Now let's analyze the lengths of the audio files in the dataset.
train['nframes'] = train['fname'].apply(lambda f: wave.open('../freesound-audio-tagging/audio_train/' + f).getnframes())
test['nframes'] = test['fname'].apply(lambda f: wave.open('../freesound-audio-tagging/audio_test/' + f).getnframes())
_, ax = plt.subplots(figsize=(16, 4))
sns.violinplot(ax=ax, x="label", y="nframes", data=train)
plt.xticks(rotation=90)
plt.title('Distribution of audio frames per label', fontsize=16)
plt.show()
We can see that the frame-length distributions differ considerably across labels.
Now let's compare the distribution of frame lengths in the training and test sets.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16,5))
train.nframes.hist(bins=100, ax=axes[0])
test.nframes.hist(bins=100, ax=axes[1])
plt.suptitle('Distribution of frame lengths in train and test', ha='center', fontsize='large');
We find that the test set contains files with abnormal lengths. Let's inspect them (detailed analysis omitted).
abnormal_length = [707364, 353682, 138474, 184338]
for length in abnormal_length:
    abnormal_fnames = test.loc[test.nframes == length, 'fname'].values
    print("Frame length = ", length, " Number of files = ", abnormal_fnames.shape[0], end=" ")
    fname = np.random.choice(abnormal_fnames)
    print("Playing ", fname)
    IPython.display.display(ipd.Audio('../freesound-audio-tagging/audio_test/' + fname))
We will build two models: one trained directly on the raw audio waveform and one trained on MFCC features.
The architecture of our model is shown below:
Important note: Because of the Kaggle kernel time limit, it is not possible to run 10-fold training of the large model here. I trained the models locally and uploaded their output files as a dataset. If you want to train the larger models, change COMPLETE_RUN = True at the beginning of the kernel.
Some important imports:
import librosa
import soundfile as sf #PySoundFile 0.9.0.post1
import numpy as np
import scipy
from tensorflow.keras import losses, models, optimizers
from tensorflow.keras.activations import relu, softmax
from tensorflow.keras.callbacks import (EarlyStopping, LearningRateScheduler,
ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from tensorflow.keras.layers import (Convolution1D, Dense, Dropout, GlobalAveragePooling1D,
GlobalMaxPool1D, Input, MaxPool1D, concatenate)
from tensorflow.keras.utils import Sequence, to_categorical
The Configuration object stores the learning parameters shared between the data generators, the model, and the training function. Anything that would otherwise be a training-related global can become part of the Configuration object.
class Config(object):
def __init__(self,
sampling_rate=16000, audio_duration=2, n_classes=41,
use_mfcc=False, n_folds=10, learning_rate=0.0001,
max_epochs=50, n_mfcc=20):
self.sampling_rate = sampling_rate
self.audio_duration = audio_duration
self.n_classes = n_classes
self.use_mfcc = use_mfcc
self.n_mfcc = n_mfcc
self.n_folds = n_folds
self.learning_rate = learning_rate
self.max_epochs = max_epochs
self.audio_length = self.sampling_rate * self.audio_duration
if self.use_mfcc:
self.dim = (self.n_mfcc, 1 + int(np.floor(self.audio_length/512)), 1)
else:
self.dim = (self.audio_length, 1)
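As a quick sanity check of the shape logic (my own example, not part of the original notebook): with the defaults sampling_rate=16000 and audio_duration=2, audio_length is 32000, so with MFCC features the input shape is (20, 1 + floor(32000/512), 1) = (20, 63, 1), and (32000, 1) otherwise.
cfg = Config(use_mfcc=True)   # hypothetical instance, default values otherwise
print(cfg.dim)                # (20, 63, 1); with use_mfcc=False this would be (32000, 1)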
The DataGenerator class inherits from keras.utils.Sequence. It is useful for preprocessing data and feeding it to a Keras model.
Once initialized with a batch_size, it computes the number of batches in an epoch. The __len__ method tells Keras how many batches to draw per epoch.
The __getitem__ method takes an index (the batch number) and, after computing the offset, returns one batch of data (X and y). At test time, only X is returned.
If we want to perform some action after each epoch (such as shuffling the data, or increasing the proportion of augmented data), we can use the on_epoch_end method.
Note: Sequence is a safer way to do multiprocessing. This structure guarantees that the network trains on each sample exactly once per epoch, which is not the case with plain generators.
class DataGenerator(Sequence):
def __init__(self, config, data_dir, list_IDs, labels=None,
batch_size=64, preprocessing_fn=lambda x: x):
self.config = config
self.data_dir = data_dir
self.list_IDs = list_IDs
self.labels = labels
self.batch_size = batch_size
self.preprocessing_fn = preprocessing_fn
self.on_epoch_end()
self.dim = self.config.dim
def __len__(self):
return int(np.ceil(len(self.list_IDs) / self.batch_size))
def __getitem__(self, index):
indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
list_IDs_temp = [self.list_IDs[k] for k in indexes]
return self.__data_generation(list_IDs_temp)
def on_epoch_end(self):
self.indexes = np.arange(len(self.list_IDs))
def __data_generation(self, list_IDs_temp):
cur_batch_size = len(list_IDs_temp)
X = np.empty((cur_batch_size, *self.dim))
input_length = self.config.audio_length
for i, ID in enumerate(list_IDs_temp):
file_path = self.data_dir + ID
# Read and Resample the audio
try:
data, _ = librosa.core.load(file_path, sr=self.config.sampling_rate,
res_type='kaiser_fast')
except:
data = []
# DATA,SR = sf.read(file_path, channels=2, samplerate=48000,dtype=np.float32,
# subtype='PCM_32',format="RAW",endian='LITTLE')
# data = librosa.resample(DATA,SR,target_sr=self.config.sampling_rate,res_type='kaiser_fast')
# DATA,SR = librosa.load(file_path,sr=44100,mono=False,dtype=np.float32)
# data = librosa.resample(DATA,SR,self.config.sampling_rate)
# Random offset / Padding
if len(data) > input_length:
max_offset = len(data) - input_length
offset = np.random.randint(max_offset)
data = data[offset:(input_length+offset)]
else:
if input_length > len(data):
max_offset = input_length - len(data)
offset = np.random.randint(max_offset)
else:
offset = 0
data = np.pad(data, (offset, input_length - len(data) - offset), "constant")
# Normalization + Other Preprocessing
if self.config.use_mfcc:
data = librosa.feature.mfcc(data, sr=self.config.sampling_rate,
n_mfcc=self.config.n_mfcc)
data = np.expand_dims(data, axis=-1)
else:
data = self.preprocessing_fn(data)[:, np.newaxis]
X[i,] = data
if self.labels is not None:
y = np.empty(cur_batch_size, dtype=int)
for i, ID in enumerate(list_IDs_temp):
y[i] = self.labels[ID]
return X, to_categorical(y, num_classes=self.config.n_classes)
else:
return X
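One thing worth noting: on_epoch_end above only rebuilds the index array, it does not shuffle. If you wanted the per-epoch shuffling mentioned earlier, a minimal sketch (my addition, not in the original class) could look like this:
# Hypothetical variant: shuffle the sample order between epochs
class ShufflingDataGenerator(DataGenerator):
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.list_IDs))
        if self.labels is not None:        # only shuffle when training labels are available
            np.random.shuffle(self.indexes)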
Normalization is a key preprocessing step. The simplest approach is min-max scaling to the range [0, 1]; the function below additionally centers the result around zero.
def audio_norm(data):
max_data = np.max(data)
min_data = np.min(data)
data = (data-min_data)/(max_data-min_data+1e-6)
return data-0.5
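A quick illustration (my own check, not from the original kernel): audio_norm maps any non-constant signal into roughly [-0.5, 0.5].
x = np.array([-3.0, 0.0, 1.0])
print(audio_norm(x))   # approximately [-0.5, 0.25, 0.5]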
def get_1d_dummy_model(config):
nclass = config.n_classes
input_length = config.audio_length
inp = Input(shape=(input_length,1))
x = GlobalMaxPool1D()(inp)
out = Dense(nclass, activation=softmax)(x)
model = models.Model(inputs=inp, outputs=out)
opt = optimizers.Adam(config.learning_rate)
model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['acc'])
return model
def get_1d_conv_model(config):
nclass = config.n_classes
input_length = config.audio_length
inp = Input(shape=(input_length,1))
x = Convolution1D(16, 9, activation=relu, padding="valid")(inp)
x = Convolution1D(16, 9, activation=relu, padding="valid")(x)
x = MaxPool1D(16)(x)
x = Dropout(rate=0.1)(x)
x = Convolution1D(32, 3, activation=relu, padding="valid")(x)
x = Convolution1D(32, 3, activation=relu, padding="valid")(x)
x = MaxPool1D(4)(x)
x = Dropout(rate=0.1)(x)
x = Convolution1D(32, 3, activation=relu, padding="valid")(x)
x = Convolution1D(32, 3, activation=relu, padding="valid")(x)
x = MaxPool1D(4)(x)
x = Dropout(rate=0.1)(x)
x = Convolution1D(256, 3, activation=relu, padding="valid")(x)
x = Convolution1D(256, 3, activation=relu, padding="valid")(x)
x = GlobalMaxPool1D()(x)
x = Dropout(rate=0.2)(x)
x = Dense(64, activation=relu)(x)
x = Dense(1028, activation=relu)(x)
out = Dense(nclass, activation=softmax)(x)
model = models.Model(inputs=inp, outputs=out)
opt = optimizers.Adam(config.learning_rate)
model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['acc'])
return model
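Since the architecture figure referenced above is not reproduced here, one way to inspect the network layer by layer is Keras's built-in summary (a usage sketch, assuming the full Config):
model = get_1d_conv_model(Config(sampling_rate=16000, audio_duration=2))
model.summary()   # prints the Conv1D / MaxPool1D / Dense stack and parameter counts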
It is important to convert the raw labels into integer indices.
LABELS = list(train.label.unique())
label_idx = {label: i for i, label in enumerate(LABELS)}
train.set_index("fname", inplace=True)
test.set_index("fname", inplace=True)
train["label_idx"] = train.label.apply(lambda x: label_idx[x])
if not COMPLETE_RUN:
train = train[:2000]
test = test[:2000]
config = Config(sampling_rate=16000, audio_duration=2, n_folds=10, learning_rate=0.001)
if not COMPLETE_RUN:
config = Config(sampling_rate=100, audio_duration=1, n_folds=2, max_epochs=1)
Below is the code for 10-fold training:
We use sklearn.model_selection.StratifiedKFold to split the training data into 10 folds.
We use several Keras callbacks to monitor training.
ModelCheckpoint saves the best weights of our model (based on the validation data). We use these weights to make the test predictions.
EarlyStopping stops training once the validation loss stops decreasing.
TensorBoard helps us visualize the training and validation loss and accuracy.
We fit the model using DataGenerators built from the training and validation splits.
We obtain predictions for train and test and save them in .npy format. We also generate a submission file. For 10-fold cross-validation there should be 10 prediction files; we will ensemble these predictions later.
PREDICTION_FOLDER = "predictions_1d_conv"
if not os.path.exists(PREDICTION_FOLDER):
os.mkdir(PREDICTION_FOLDER)
if os.path.exists('logs/' + PREDICTION_FOLDER):
shutil.rmtree('logs/' + PREDICTION_FOLDER)
skf = StratifiedKFold(n_splits=config.n_folds).split(train,train.label_idx)
for i, (train_split, val_split) in enumerate(skf):
train_set = train.iloc[train_split]
val_set = train.iloc[val_split]
print(f"train set shape {train_set.shape} ; val set shape {val_set.shape}")
checkpoint = ModelCheckpoint('best_%d.h5'%i, monitor='val_loss', verbose=1, save_best_only=True)
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
tb = TensorBoard(log_dir='./logs/' + PREDICTION_FOLDER + '/fold_%d'%i, write_graph=True)
callbacks_list = [checkpoint, early, tb]
print("Fold: ", i)
print("#"*50)
if COMPLETE_RUN:
model = get_1d_conv_model(config)
else:
model = get_1d_dummy_model(config)
train_generator = DataGenerator(config, '../freesound-audio-tagging/audio_train/', train_set.index,
train_set.label_idx, batch_size=64,
preprocessing_fn=audio_norm)
val_generator = DataGenerator(config, '../freesound-audio-tagging/audio_train/', val_set.index,
val_set.label_idx, batch_size=64,
preprocessing_fn=audio_norm)
history = model.fit_generator(train_generator, callbacks=callbacks_list, validation_data=val_generator,
epochs=config.max_epochs, use_multiprocessing=True, workers=6, max_queue_size=20)
model.load_weights('best_%d.h5'%i)
# Save train predictions
train_generator = DataGenerator(config, '../freesound-audio-tagging/audio_train/', train.index, batch_size=128,
preprocessing_fn=audio_norm)
predictions = model.predict_generator(train_generator, use_multiprocessing=True,
workers=6, max_queue_size=20, verbose=1)
np.save(PREDICTION_FOLDER + "/train_predictions_%d.npy"%i, predictions)
# Save test predictions
test_generator = DataGenerator(config, '../freesound-audio-tagging/audio_test/', test.index, batch_size=128,
preprocessing_fn=audio_norm)
predictions = model.predict_generator(test_generator, use_multiprocessing=False,
workers=6, max_queue_size=20, verbose=1)
np.save(PREDICTION_FOLDER + "/test_predictions_%d.npy"%i, predictions)
# Make a submission file
top_3 = np.array(LABELS)[np.argsort(-predictions, axis=1)[:, :3]]
predicted_labels = [' '.join(list(x)) for x in top_3]
test['label'] = predicted_labels
test[['label']].to_csv(PREDICTION_FOLDER + "/predictions_%d.csv"%i)
Now that we have trained our models, it is time to average the predictions over the 10 folds (here, 2 folds). We will try the geometric mean and see what our public LB score is.
pred_list = []
for i in range(2):
pred_list.append(np.load(PREDICTION_FOLDER +"/test_predictions_%d.npy"%i))
prediction = np.ones_like(pred_list[0])
for pred in pred_list:
prediction = prediction*pred
prediction = prediction**(1./len(pred_list))
# Make a submission file
top_3 = np.array(LABELS)[np.argsort(-prediction, axis=1)[:, :3]]
predicted_labels = [' '.join(list(x)) for x in top_3]
test = pd.read_csv('../freesound-audio-tagging/sample_submission.csv')
test['label'] = predicted_labels
test[['fname', 'label']].to_csv("1d_conv_ensembled_submission.csv", index=False)
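For comparison (not part of the original kernel), an arithmetic mean over the fold predictions is a one-line alternative to the geometric mean above:
prediction = np.mean(np.stack(pred_list, axis=0), axis=0)   # simple average over folds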