How to deal with running out of GPU memory in PyTorch
1. Converting float32 to float16 is the most effective way to cut GPU memory usage; it roughly halves memory consumption.
How to do it: add the following line at the very top of your code:
torch.set_default_dtype(torch.float16)
This line makes every floating-point tensor created inside the program default to float16. If you run the program as-is at this point, you will get an error reporting that the input is float while the weights are half. Convert the input to half as well, like this:
inputs = inputs.type(torch.float16)
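Putting the two pieces together, here is a minimal self-contained sketch; the Linear layer and tensor shapes are placeholders for illustration only:

import torch
import torch.nn as nn

# Newly created floating-point tensors and parameters now default to float16.
torch.set_default_dtype(torch.float16)

net = nn.Linear(128, 10).cuda()      # weights are created in float16
inputs = torch.randn(4, 128).cuda()
inputs = inputs.type(torch.float16)  # make sure the input matches the weights
out = net(inputs)
print(out.dtype)                     # torch.float16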
Method 1 is not recommended: when cuDNN acceleration kicks in later, it is prone to failing with an error like the following.

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
You can try to repro this exception using the following code snippet. If that doesn't trigger the error, please include your original repro script when reporting this issue.

import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.allow_tf32 = True
data = torch.randn([4, 32, 119, 159], dtype=torch.half, device='cuda', requires_grad=True)
net = torch.nn.Conv2d(32, 64, kernel_size=[5, 5], padding=[0, 0], stride=[2, 2], dilation=[1, 1], groups=1)
net = net.cuda().half()
out = net(data)
out.backward(torch.randn_like(out))
torch.cuda.synchronize()

ConvolutionParams
    data_type = CUDNN_DATA_HALF
    padding = [0, 0, 0]
    stride = [2, 2, 0]
    dilation = [1, 1, 0]
    groups = 1
    deterministic = false
    allow_tf32 = true
input: TensorDescriptor 000001E834DE5180
    type = CUDNN_DATA_HALF
    nbDims = 4
    dimA = 4, 32, 119, 159,
    strideA = 605472, 18921, 159, 1,
output: TensorDescriptor 000001E834DE3AC0
    type = CUDNN_DATA_HALF
    nbDims = 4
    dimA = 4, 64, 58, 78,
    strideA = 289536, 4524, 78, 1,
weight: FilterDescriptor 000001E8349A6610
    type = CUDNN_DATA_HALF
    tensor_format = CUDNN_TENSOR_NCHW
    nbDims = 4
    dimA = 64, 32, 5, 5,
Pointer addresses:
    input: 0000002363108000
    output: 00000023637A8800
    weight: 0000002305E01600
Additional pointer addresses:
    grad_output: 00000023637A8800
    grad_input: 0000002363108000
Backward data algorithm: 1
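For comparison, a commonly used way to get most of the float16 memory saving without forcing every tensor to half precision is PyTorch's automatic mixed precision (torch.cuda.amp), which keeps the weights in float32 and runs only selected ops in float16. This is only a minimal sketch; the model, data, optimizer and loss here are placeholders:

import torch
import torch.nn as nn

# Placeholder model, optimizer, loss and data for the sketch.
net = nn.Sequential(nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
                    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, 10)).cuda()
optim = torch.optim.Adam(net.parameters(), 1e-3)
losser = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()       # scales the loss to avoid float16 underflow

x = torch.randn(4, 3, 224, 224, device='cuda')
target = torch.randint(0, 10, (4,), device='cuda')

optim.zero_grad()
with torch.cuda.amp.autocast():            # selected ops run in float16, the rest stay float32
    y = net(x)
    loss = losser(y, target)
scaler.scale(loss).backward()
scaler.step(optim)
scaler.update()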
2. Using PyTorch's checkpoint feature (gradient checkpointing) can greatly reduce GPU memory usage.
How to do it:
For DenseNet, the official torchvision implementation already supports checkpointing: passing memory_efficient=True when constructing the densenet enables the memory saving, for example as sketched below.
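A minimal sketch of that option, assuming torchvision is installed (densenet121 is just one of the densenet variants):

import torch
from torchvision import models

# memory_efficient=True makes the dense layers use gradient checkpointing,
# trading extra compute in the backward pass for lower memory use.
net = models.densenet121(memory_efficient=True).cuda()

x = torch.randn(4, 3, 224, 224, device='cuda')
y = net(x)
print(y.shape)  # torch.Size([4, 1000]) with the default 1000 classes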
For other networks, you can add checkpointing in the following way:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint
from torchvision.datasets.cifar import CIFAR10
import numpy as np
from progressbar import progressbar


def conv_bn_relu(in_ch, out_ch, ker_sz, stride, pad):
    return nn.Sequential(nn.Conv2d(in_ch, out_ch, ker_sz, stride, pad, bias=False),
                         nn.BatchNorm2d(out_ch),
                         nn.ReLU())


class NetA(nn.Module):
    def __init__(self, use_checkpoint=False):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        k = 2
        self.layer1 = conv_bn_relu(3, 32*k, 3, 1, 1)
        self.layer2 = conv_bn_relu(32*k, 32*k, 3, 2, 1)
        self.layer3 = conv_bn_relu(32*k, 64*k, 3, 1, 1)
        self.layer4 = conv_bn_relu(64*k, 64*k, 3, 2, 1)
        self.layer5 = conv_bn_relu(64*k, 128*k, 3, 1, 1)
        self.layer6 = conv_bn_relu(128*k, 128*k, 3, 2, 1)
        self.layer7 = conv_bn_relu(128*k, 256*k, 3, 1, 1)
        self.layer8 = conv_bn_relu(256*k, 256*k, 3, 2, 1)
        self.layer9 = nn.Linear(256*k, 10)

    def seg0(self, y):
        y = self.layer1(y)
        return y

    def seg1(self, y):
        y = self.layer2(y)
        y = self.layer3(y)
        return y

    def seg2(self, y):
        y = self.layer4(y)
        y = self.layer5(y)
        return y

    def seg3(self, y):
        y = self.layer6(y)
        y = self.layer7(y)
        return y

    def seg4(self, y):
        y = self.layer8(y)
        y = F.adaptive_avg_pool2d(y, 1)
        y = torch.flatten(y, 1)
        y = self.layer9(y)
        return y

    def forward(self, x):
        y = x
        # Make the input require grad, otherwise the checkpointed segments
        # would not build a graph and gradients would not flow back.
        y = y + torch.zeros(1, dtype=y.dtype, device=y.device, requires_grad=True)
        if self.use_checkpoint:
            # Each segment's activations are recomputed during backward
            # instead of being kept in GPU memory.
            y = checkpoint(self.seg0, y)
            y = checkpoint(self.seg1, y)
            y = checkpoint(self.seg2, y)
            y = checkpoint(self.seg3, y)
            y = checkpoint(self.seg4, y)
        else:
            y = self.seg0(y)
            y = self.seg1(y)
            y = self.seg2(y)
            y = self.seg3(y)
            y = self.seg4(y)
        return y


if __name__ == '__main__':
    net = NetA(use_checkpoint=True).cuda()

    train_dataset = CIFAR10('../datasets/cifar10', True, download=True)
    train_x = np.asarray(train_dataset.data, np.uint8)
    train_y = np.asarray(train_dataset.targets, np.int64)

    losser = nn.CrossEntropyLoss()
    optim = torch.optim.Adam(net.parameters(), 1e-3)

    epoch = 10
    batch_size = 31
    batch_count = int(np.ceil(len(train_x) / batch_size))

    for e_id in range(epoch):
        print('epoch', e_id)
        print('training')
        net.train()
        loss_sum = 0
        for b_id in progressbar(range(batch_count)):
            optim.zero_grad()
            batch_x = train_x[batch_size*b_id: batch_size*(b_id+1)]
            batch_y = train_y[batch_size*b_id: batch_size*(b_id+1)]
            batch_x = torch.from_numpy(batch_x).permute(0, 3, 1, 2).float() / 255.
            batch_y = torch.from_numpy(batch_y).long()
            batch_x = batch_x.cuda()
            batch_y = batch_y.cuda()
            batch_x = F.interpolate(batch_x, (224, 224), mode='bilinear')
            y = net(batch_x)
            loss = losser(y, batch_y)
            loss.backward()
            optim.step()
            loss_sum += loss.item()
        print('loss', loss_sum / batch_count)

        with torch.no_grad():
            print('testing')
            net.eval()
            acc_sum = 0
            for b_id in progressbar(range(batch_count)):
                optim.zero_grad()
                batch_x = train_x[batch_size * b_id: batch_size * (b_id + 1)]
                batch_y = train_y[batch_size * b_id: batch_size * (b_id + 1)]
                batch_x = torch.from_numpy(batch_x).permute(0, 3, 1, 2).float() / 255.
                batch_y = torch.from_numpy(batch_y).long()
                batch_x = batch_x.cuda()
                batch_y = batch_y.cuda()
                batch_x = F.interpolate(batch_x, (224, 224), mode='bilinear')
                y = net(batch_x)
                y = torch.topk(y, 1, dim=1).indices
                y = y[:, 0]
                acc = (y == batch_y).float().sum() / len(batch_x)
                acc_sum += acc.item()
            print('acc', acc_sum / batch_count)

        # Reshuffle the training data for the next epoch.
        ids = np.arange(len(train_x))
        np.random.shuffle(ids)
        train_x = train_x[ids]
        train_y = train_y[ids]
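When the segments form a plain stack of modules, the same effect can also be obtained with torch.utils.checkpoint.checkpoint_sequential, which splits an nn.Sequential into a given number of checkpointed segments. This is only a minimal sketch with a toy placeholder model:

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint_sequential

# Toy Sequential model; any stack of modules can be segmented the same way.
net = nn.Sequential(
    nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
    nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(64, 128, 3, stride=2, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(128, 10),
).cuda()

# The input must require grad (or come from something that does),
# otherwise the checkpointed segments will not build a graph.
x = torch.randn(4, 3, 224, 224, device='cuda', requires_grad=True)

# Split the Sequential into 3 checkpointed segments: only the segment
# boundaries keep activations, everything else is recomputed in backward.
y = checkpoint_sequential(net, 3, x)
y.sum().backward()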
However, when I actually added manual checkpointing to ResNet, I found that sometimes the change did not reduce memory usage at all, yet after a while it did again, and my own implementation did not reduce memory as efficiently as the official one. The modified ResNet is shown below.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint


class BasicBlock(nn.Module):
    """Basic Block for resnet 18 and resnet 34"""
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels * BasicBlock.expansion))
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * BasicBlock.expansion))

    def forward(self, x):
        return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))


class BottleNeck(nn.Module):
    """Residual block for resnet over 50 layers"""
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, stride=stride, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels * BottleNeck.expansion, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels * BottleNeck.expansion))
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels * BottleNeck.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * BottleNeck.expansion, stride=stride, kernel_size=1, bias=False),
                nn.BatchNorm2d(out_channels * BottleNeck.expansion))

    def forward(self, x):
        return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))


class ResNet(nn.Module):
    def __init__(self, block, num_block, num_classes=100):
        super().__init__()
        self.in_channels = 64
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True))
        self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
        self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
        self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
        self.conv5_x = self._make_layer(block, 512, num_block[3], 2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        # Two heads: a 1-unit classification head and a 7-unit regression head.
        self.classFc1 = nn.Linear(512 * block.expansion, 1)
        self.regFc1 = nn.Linear(512 * block.expansion, 7)

    def _make_layer(self, block, out_channels, num_blocks, stride):
        """make resnet layers (by 'layer' I don't mean a single network layer
        such as a conv layer; one layer may contain more than one residual block)

        Args:
            block: block type, basic block or bottle neck block
            out_channels: output depth channel number of this layer
            num_blocks: how many blocks per layer
            stride: the stride of the first block of this layer
        Return:
            a resnet layer
        """
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        # Wrap each stage in checkpoint so its activations are recomputed
        # during backward instead of being stored. Note: unlike NetA above,
        # nothing here forces x to require grad, so the conv1 segment may not
        # propagate gradients to its parameters.
        output = checkpoint(self.conv1, x)
        output = checkpoint(self.conv2_x, output)
        output = checkpoint(self.conv3_x, output)
        output = checkpoint(self.conv4_x, output)
        output = checkpoint(self.conv5_x, output)
        output = self.avg_pool(output)
        output = output.view(output.size(0), -1)
        x1 = self.classFc1(output)
        x1 = checkpoint(F.sigmoid, x1)
        x2 = checkpoint(self.regFc1, output)
        return x1, x2


def resnet18():
    """ return a ResNet 18 object"""
    return ResNet(BasicBlock, [2, 2, 2, 2])


def resnet34():
    """ return a ResNet 34 object"""
    return ResNet(BasicBlock, [3, 4, 6, 3])


def resnet50():
    """ return a ResNet 50 object"""
    return ResNet(BottleNeck, [3, 4, 6, 3])


def resnet101():
    """ return a ResNet 101 object"""
    return ResNet(BottleNeck, [3, 4, 23, 3])


def resnet152():
    """ return a ResNet 152 object"""
    return ResNet(BottleNeck, [3, 8, 36, 3])
With this change, ResNet-50 only saved about a third of the GPU memory, while ResNet-152 saved more than half.
Summary
The above covers the ways of dealing with insufficient GPU memory in PyTorch described in this post; hopefully they help you solve the problem you ran into.