Introduction to and Implementation of Batch Normalization in Deep Neural Networks
In the earlier post 經典網絡DenseNet介紹_fengbingchun的博客-CSDN博客_densenet, which introduced DenseNet, the network contained BN layers, i.e. Batch Normalization: every Dense Block involves BN operations. This post introduces BN and gives C++ and PyTorch implementations.
Batch Normalization was proposed by Sergey Ioffe et al. in 2015 in the paper "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift", available at https://arxiv.org/pdf/1502.03167.pdf .
Batch Normalization is an algorithmic method that makes training deep neural networks faster and more stable. It can be applied either before or after the activation function. It depends on the batch size: when the batch size is small, its performance degrades severely. It is computed differently during training and during inference.
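As a minimal PyTorch sketch of this training/inference difference (the layer size and input below are made up purely for illustration), nn.BatchNorm2d normalizes with the current batch statistics in training mode but with the stored running statistics in evaluation mode:

import torch
from torch import nn

bn = nn.BatchNorm2d(1)           # one channel
x = torch.randn(4, 1, 2, 2)      # [N, C, H, W]

bn.train()                       # training mode: batch statistics are used, running stats updated
y_train = bn(x)

bn.eval()                        # evaluation mode: the stored running_mean/running_var are used
y_eval = bn(x)

print(torch.allclose(y_train, y_eval))  # generally False: the two modes normalize differently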
For CNNs, BN generally works better; for RNNs, LN (Layer Normalization) is usually the better choice.
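A minimal sketch of the difference in normalization axes (shapes chosen arbitrarily for illustration): for an [N, C, H, W] input, BatchNorm2d computes statistics per channel over the batch and spatial dimensions, so it depends on the batch size, while LayerNorm computes statistics per sample over that sample's own feature dimensions:

import torch
from torch import nn

x = torch.randn(8, 3, 4, 4)      # [N, C, H, W]

bn = nn.BatchNorm2d(3)           # per-channel statistics over (N, H, W)
ln = nn.LayerNorm([3, 4, 4])     # per-sample statistics over (C, H, W)

print(bn(x).shape, ln(x).shape)  # both preserve the input shape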
Training deep neural networks is complicated by the fact that the distribution of each layer's inputs changes during training as the parameters of the previous layers change. This slows training down, because it requires lower learning rates and careful parameter initialization, and it makes it very hard to train models with saturating nonlinearities. This phenomenon is called internal covariate shift, and the problem is addressed by normalizing the layer inputs.
Batch Normalization operates on mini-batches of training samples. It allows us to use much higher learning rates and to be less careful about initialization. It also acts as a regularizer, in some cases eliminating the need for Dropout.
The Batch Normalization algorithm is given below; the figure comes from the original paper:
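For reference, the Batch Normalizing Transform applied to a mini-batch $\mathcal{B} = \{x_1, \ldots, x_m\}$ (Algorithm 1 in the paper) can be written as:

$$\mu_{\mathcal{B}} = \frac{1}{m}\sum_{i=1}^{m} x_i, \qquad \sigma_{\mathcal{B}}^{2} = \frac{1}{m}\sum_{i=1}^{m}\left(x_i - \mu_{\mathcal{B}}\right)^{2}$$

$$\hat{x}_i = \frac{x_i - \mu_{\mathcal{B}}}{\sqrt{\sigma_{\mathcal{B}}^{2} + \epsilon}}, \qquad y_i = \gamma \hat{x}_i + \beta \equiv \mathrm{BN}_{\gamma,\beta}(x_i)$$

where $\gamma$ and $\beta$ are the learnable scale and shift parameters and $\epsilon$ is a small constant that avoids division by zero.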
Within a mini-batch, each BN layer computes, for every channel, the mean and variance over the values of that channel across all samples, normalizes the data to zero mean and unit standard deviation, and finally scales and shifts the normalized data with two learnable parameters, gamma and beta. In addition, during training the mean and variance of every mini-batch are recorded for each BN layer, and the expectation of these mini-batch means and variances (in practice usually tracked with a moving average, as in the PyTorch code below) is used as that BN layer's mean and variance at inference time.
Advantages of Batch Normalization:
(1). A larger learning rate can be used without affecting convergence, making training faster and more stable;
(2). It has a regularizing effect and helps prevent overfitting, making it possible to drop Dropout and Local Response Normalization (LRN);
(3). Because the training data is shuffled, the mini-batches differ in every epoch, and normalizing over different mini-batches has a data-augmentation effect;
(4). It alleviates exploding and vanishing gradients.
The C++ implementation is as follows:
batch_normalization.hpp:
#ifndef FBC_SRC_NN_BATCH_NORM_HPP_
#define FBC_SRC_NN_BATCH_NORM_HPP_

#include <vector>
#include <memory>
#include <algorithm>

namespace ANN {

class BatchNorm {
public:
    BatchNorm(int number, int channels, int height, int width) : number_(number), channels_(channels), height_(height), width_(width)
    {
        mean_.resize(channels_);
        std::fill(mean_.begin(), mean_.end(), 0.);
        variance_.resize(channels_);
        std::fill(variance_.begin(), variance_.end(), 0.);
    }

    int LoadData(const float* data, int length);
    std::unique_ptr<float []> Run();

    void SetGamma(float gamma) { gamma_ = gamma; }
    float GetGamma() const { return gamma_; }
    void SetBeta(float beta) { beta_ = beta; }
    float GetBeta() const { return beta_; }
    void SetMean(std::vector<float> mean) { mean_ = mean; }
    std::vector<float> GetMean() const { return mean_; }
    void SetVariance(std::vector<float> variance) { variance_ = variance; }
    std::vector<float> GetVariance() const { return variance_; }
    void SetEpsilon(float epsilon) { epsilon_ = epsilon; }

private:
    int number_; // mini-batch size
    int channels_;
    int height_;
    int width_;
    std::vector<float> mean_;
    std::vector<float> variance_;
    float gamma_ = 1.; // scale
    float beta_ = 0.; // shift
    float epsilon_ = 1e-5; // small positive value to avoid zero-division
    std::vector<float> data_;
};

} // namespace ANN

#endif // FBC_SRC_NN_BATCH_NORM_HPP_
batch_normalization.cpp:
#include "batch_normalization.hpp"
#include <string.h>
#include <vector>
#include <cmath>
#include "common.hpp"namespace ANN {int BatchNorm::LoadData(const float* data, int length)
{CHECK(number_ * channels_ * height_ * width_ == length);data_.resize(length);memcpy(data_.data(), data, length * sizeof(float));return 0;
}std::unique_ptr<float[]> BatchNorm::Run()
{int spatial_size = height_ * width_;for (int n = 0; n < number_; ++n) {int offset = n * (channels_ * spatial_size);for (int c = 0; c < channels_; ++c) {const float* p = data_.data() + offset + (c * spatial_size);for (int k = 0; k < spatial_size; ++k) {mean_[c] += *p++;}}}std::transform(mean_.begin(), mean_.end(), mean_.begin(), [=](float_t x) { return x / (number_ * spatial_size); });for (int n = 0; n < number_; ++n) {int offset = n * (channels_ * spatial_size);for (int c = 0; c < channels_; ++c) {const float* p = data_.data() + offset + (c * spatial_size);for (int k = 0; k < spatial_size; ++k) {variance_[c] += std::pow(*p++ - mean_[c], 2.);}}}std::transform(variance_.begin(), variance_.end(), variance_.begin(), [=](float_t x) { return x / (std::max(1., number_*spatial_size*1.)); });std::vector<float> stddev(channels_);for (int c = 0; c < channels_; ++c) {stddev[c] = std::sqrt(variance_[c] + epsilon_);}std::unique_ptr<float[]> output(new float[number_ * channels_ * spatial_size]);for (int n = 0; n < number_; ++n) {const float* p1 = data_.data() + n * (channels_ * spatial_size);float* p2 = output.get() + n * (channels_ * spatial_size);for (int c = 0; c < channels_; ++c) {for (int k = 0; k < spatial_size; ++k) {*p2++ = (*p1++ - mean_[c]) / stddev[c];}}}return output;
}} // namespace ANN
funset.cpp:
int test_batch_normalization()
{
    const std::vector<float> data = { 11.1, -2.2, 23.3, 54.4, 58.5, -16.6,
                                      -97.7, -28.8, 49.9, -61.3, 52.6, -33.9,
                                      -2.45, -15.7, 72.4, 9.1, 47.2, 21.7 };
    const int number = 3, channels = 1, height = 1, width = 6;

    ANN::BatchNorm bn(number, channels, height, width);
    bn.LoadData(data.data(), data.size());
    std::unique_ptr<float[]> output = bn.Run();

    fprintf(stdout, "result:\n");
    for (int n = 0; n < number; ++n) {
        const float* p = output.get() + n * (channels * height * width);
        for (int c = 0; c < channels; ++c) {
            for (int h = 0; h < height; ++h) {
                for (int w = 0; w < width; ++w) {
                    fprintf(stdout, "%f, ", p[c * (height * width) + h * width + w]);
                }
                fprintf(stdout, "\n");
            }
        }
    }

    return 0;
}
The execution result is as follows:
The following implementation calls the PyTorch API; the source code comes from https://zh.d2l.ai/chapter_convolutional-modern/batch-norm.html
import torch
from torch import nn
import numpy as np

# reference: https://zh.d2l.ai/chapter_convolutional-modern/batch-norm.html
# BatchNorm reimplementation
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # Use is_grad_enabled to determine whether we are in training or prediction mode
    if not torch.is_grad_enabled():
        # In prediction mode, directly use the mean and variance obtained from the moving averages
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Fully connected layer: compute the mean and variance over the feature dimension
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 2D convolutional layer: compute the mean and variance over the channel dimension (axis=1).
            # Keep the shape of X so that broadcasting works later
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # In training mode, standardize with the current mean and variance
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Update the moving averages of the mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean.data, moving_var.data

class BatchNorm(nn.Module):
    # num_features: number of outputs of a fully connected layer or number of output channels of a convolutional layer
    # num_dims: 2 for a fully connected layer, 4 for a convolutional layer
    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # Scale and shift parameters involved in gradient computation, initialized to 1 and 0
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Variables that are not model parameters, initialized to 0 and 1
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.ones(shape)

    def forward(self, X):
        # If X is not in main memory, copy moving_mean and moving_var to the device where X lives
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # Save the updated moving_mean and moving_var
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma, self.beta, self.moving_mean,
            self.moving_var, eps=1e-5, momentum=0.9)
        return Y

# N = 3, C = 1, H = 1, W = 6
data = [[[[11.1, -2.2, 23.3, 54.4, 58.5, -16.6]]],
        [[[-97.7, -28.8, 49.9, -61.3, 52.6, -33.9]]],
        [[[-2.45, -15.7, 72.4, 9.1, 47.2, 21.7]]]]
input = torch.FloatTensor(data) # [N, C, H, W]
print("input shape:", input.shape)model = BatchNorm(1, 2)
output = model(input)
print("output:", output)print("test finish")
The execution result is as follows. As can be seen, the C++ and PyTorch implementations produce the same result.
The following is test code that calls the tiny-dnn API:
int test_dnn_batch_normalization()
{
    const std::vector<float> data = { 11.1, -2.2, 23.3, 54.4, 58.5, -16.6,
                                      -97.7, -28.8, 49.9, -61.3, 52.6, -33.9,
                                      -2.45, -15.7, 72.4, 9.1, 47.2, 21.7 };
    const int number = 3, channels = 1, height = 1, width = 6;
    const int spatial_size = height * width;

    tiny_dnn::tensor_t in_data(number), out_data(number);
    for (int n = 0; n < number; ++n) {
        in_data[n].resize(spatial_size * channels);
        out_data[n].resize(spatial_size * channels);

        int offset = n * (spatial_size * channels);
        memcpy(in_data[n].data(), data.data() + offset, sizeof(float) * spatial_size * channels);
        std::fill(out_data[n].begin(), out_data[n].end(), 0.);
    }

    std::vector<tiny_dnn::tensor_t*> in(1), out(1);
    in[0] = &in_data;
    out[0] = &out_data;

    tiny_dnn::batch_normalization_layer bn(spatial_size, channels);
    bn.forward_propagation(in, out);

    fprintf(stdout, "tiny_dnn result:\n");
    for (int n = 0; n < number; ++n) {
        for (int s = 0; s < spatial_size * channels; ++s)
            fprintf(stdout, "%f ", out_data[n][s]);
        fprintf(stdout, "\n");
    }

    return 0;
}
The execution result is as follows. If it differs from the C++ and PyTorch results above, the reason is that when computing the variance, tiny-dnn's math_functions.h uses num_examples*spatial_dim-1.0f as the divisor rather than num_examples*spatial_dim.
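In other words, the two variance estimates differ only in whether Bessel's correction is applied; with $m = \text{num\_examples} \times \text{spatial\_dim}$ (here $m = 3 \times 6 = 18$):

$$\sigma_{\text{biased}}^{2} = \frac{1}{m}\sum_{i=1}^{m}\left(x_i - \mu\right)^{2}, \qquad \sigma_{\text{unbiased}}^{2} = \frac{1}{m-1}\sum_{i=1}^{m}\left(x_i - \mu\right)^{2}$$

The C++ code and the PyTorch batch_norm above divide by $m$, while tiny-dnn divides by $m-1$, so (ignoring $\epsilon$) its outputs are scaled by roughly $\sqrt{(m-1)/m}$ relative to the others.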
GitHub:
https://github.com/fengbingchun/NN_Test
https://github.com/fengbingchun/PyTorch_Test