How to deal with running out of GPU memory in PyTorch
1. Converting float32 to float16 is the most effective way to cut GPU memory usage; it roughly halves memory consumption.
How to do it: add the following line at the very top of your code:
torch.set_default_dtype(torch.float16)
This line makes every floating-point tensor created inside the program default to float16. If you run the program as-is at this point, you will get an error reporting that the input is float while the weights are half. Convert the input to half as well, like this:
inputs = inputs.type(torch.float16)
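Putting the two pieces together, here is a minimal self-contained sketch; the Linear layer and tensor shapes are placeholders for illustration only:

import torch
import torch.nn as nn

# Newly created floating-point tensors and parameters now default to float16.
torch.set_default_dtype(torch.float16)

net = nn.Linear(128, 10).cuda()      # weights are created in float16
inputs = torch.randn(4, 128).cuda()
inputs = inputs.type(torch.float16)  # make sure the input matches the weights
out = net(inputs)
print(out.dtype)                     # torch.float16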
Method 1 is not recommended: when cuDNN acceleration kicks in later, it is prone to failing with an error like the following.

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
You can try to repro this exception using the following code snippet. If that doesn't trigger the error, please include your original repro script when reporting this issue.

import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.allow_tf32 = True
data = torch.randn([4, 32, 119, 159], dtype=torch.half, device='cuda', requires_grad=True)
net = torch.nn.Conv2d(32, 64, kernel_size=[5, 5], padding=[0, 0], stride=[2, 2], dilation=[1, 1], groups=1)
net = net.cuda().half()
out = net(data)
out.backward(torch.randn_like(out))
torch.cuda.synchronize()

ConvolutionParams
    data_type = CUDNN_DATA_HALF
    padding = [0, 0, 0]
    stride = [2, 2, 0]
    dilation = [1, 1, 0]
    groups = 1
    deterministic = false
    allow_tf32 = true
input: TensorDescriptor 000001E834DE5180
    type = CUDNN_DATA_HALF
    nbDims = 4
    dimA = 4, 32, 119, 159,
    strideA = 605472, 18921, 159, 1,
output: TensorDescriptor 000001E834DE3AC0
    type = CUDNN_DATA_HALF
    nbDims = 4
    dimA = 4, 64, 58, 78,
    strideA = 289536, 4524, 78, 1,
weight: FilterDescriptor 000001E8349A6610
    type = CUDNN_DATA_HALF
    tensor_format = CUDNN_TENSOR_NCHW
    nbDims = 4
    dimA = 64, 32, 5, 5,
Pointer addresses:
    input: 0000002363108000
    output: 00000023637A8800
    weight: 0000002305E01600
Additional pointer addresses:
    grad_output: 00000023637A8800
    grad_input: 0000002363108000
Backward data algorithm: 1
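For comparison, a commonly used way to get most of the float16 memory saving without forcing every tensor to half precision is PyTorch's automatic mixed precision (torch.cuda.amp), which keeps the weights in float32 and runs only selected ops in float16. This is only a minimal sketch; the model, data, optimizer and loss here are placeholders:

import torch
import torch.nn as nn

# Placeholder model, optimizer, loss and data for the sketch.
net = nn.Sequential(nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
                    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, 10)).cuda()
optim = torch.optim.Adam(net.parameters(), 1e-3)
losser = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()       # scales the loss to avoid float16 underflow

x = torch.randn(4, 3, 224, 224, device='cuda')
target = torch.randint(0, 10, (4,), device='cuda')

optim.zero_grad()
with torch.cuda.amp.autocast():            # selected ops run in float16, the rest stay float32
    y = net(x)
    loss = losser(y, target)
scaler.scale(loss).backward()
scaler.step(optim)
scaler.update()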
2. Using PyTorch's checkpoint feature (gradient checkpointing) can greatly reduce GPU memory usage.
How to do it:
For DenseNet, the official torchvision implementation already supports checkpointing: passing memory_efficient=True when constructing the densenet enables the memory saving, for example as sketched below.
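A minimal sketch of that option, assuming torchvision is installed (densenet121 is just one of the densenet variants):

import torch
from torchvision import models

# memory_efficient=True makes the dense layers use gradient checkpointing,
# trading extra compute in the backward pass for lower memory use.
net = models.densenet121(memory_efficient=True).cuda()

x = torch.randn(4, 3, 224, 224, device='cuda')
y = net(x)
print(y.shape)  # torch.Size([4, 1000]) with the default 1000 classes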
For other networks, you can add checkpointing in the following way:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint
from torchvision.datasets.cifar import CIFAR10
import numpy as np
from progressbar import progressbar


def conv_bn_relu(in_ch, out_ch, ker_sz, stride, pad):
    return nn.Sequential(nn.Conv2d(in_ch, out_ch, ker_sz, stride, pad, bias=False),
                         nn.BatchNorm2d(out_ch),
                         nn.ReLU())


class NetA(nn.Module):
    def __init__(self, use_checkpoint=False):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        k = 2
        self.layer1 = conv_bn_relu(3, 32*k, 3, 1, 1)
        self.layer2 = conv_bn_relu(32*k, 32*k, 3, 2, 1)
        self.layer3 = conv_bn_relu(32*k, 64*k, 3, 1, 1)
        self.layer4 = conv_bn_relu(64*k, 64*k, 3, 2, 1)
        self.layer5 = conv_bn_relu(64*k, 128*k, 3, 1, 1)
        self.layer6 = conv_bn_relu(128*k, 128*k, 3, 2, 1)
        self.layer7 = conv_bn_relu(128*k, 256*k, 3, 1, 1)
        self.layer8 = conv_bn_relu(256*k, 256*k, 3, 2, 1)
        self.layer9 = nn.Linear(256*k, 10)

    def seg0(self, y):
        y = self.layer1(y)
        return y

    def seg1(self, y):
        y = self.layer2(y)
        y = self.layer3(y)
        return y

    def seg2(self, y):
        y = self.layer4(y)
        y = self.layer5(y)
        return y

    def seg3(self, y):
        y = self.layer6(y)
        y = self.layer7(y)
        return y

    def seg4(self, y):
        y = self.layer8(y)
        y = F.adaptive_avg_pool2d(y, 1)
        y = torch.flatten(y, 1)
        y = self.layer9(y)
        return y

    def forward(self, x):
        y = x
        # Make the input require grad, otherwise the checkpointed segments
        # would not build a graph and gradients would not flow back.
        y = y + torch.zeros(1, dtype=y.dtype, device=y.device, requires_grad=True)
        if self.use_checkpoint:
            # Each segment's activations are recomputed during backward
            # instead of being kept in GPU memory.
            y = checkpoint(self.seg0, y)
            y = checkpoint(self.seg1, y)
            y = checkpoint(self.seg2, y)
            y = checkpoint(self.seg3, y)
            y = checkpoint(self.seg4, y)
        else:
            y = self.seg0(y)
            y = self.seg1(y)
            y = self.seg2(y)
            y = self.seg3(y)
            y = self.seg4(y)
        return y


if __name__ == '__main__':
    net = NetA(use_checkpoint=True).cuda()

    train_dataset = CIFAR10('../datasets/cifar10', True, download=True)
    train_x = np.asarray(train_dataset.data, np.uint8)
    train_y = np.asarray(train_dataset.targets, np.int64)

    losser = nn.CrossEntropyLoss()
    optim = torch.optim.Adam(net.parameters(), 1e-3)

    epoch = 10
    batch_size = 31
    batch_count = int(np.ceil(len(train_x) / batch_size))

    for e_id in range(epoch):
        print('epoch', e_id)
        print('training')
        net.train()
        loss_sum = 0
        for b_id in progressbar(range(batch_count)):
            optim.zero_grad()
            batch_x = train_x[batch_size*b_id: batch_size*(b_id+1)]
            batch_y = train_y[batch_size*b_id: batch_size*(b_id+1)]
            batch_x = torch.from_numpy(batch_x).permute(0, 3, 1, 2).float() / 255.
            batch_y = torch.from_numpy(batch_y).long()
            batch_x = batch_x.cuda()
            batch_y = batch_y.cuda()
            batch_x = F.interpolate(batch_x, (224, 224), mode='bilinear')
            y = net(batch_x)
            loss = losser(y, batch_y)
            loss.backward()
            optim.step()
            loss_sum += loss.item()
        print('loss', loss_sum / batch_count)

        with torch.no_grad():
            print('testing')
            net.eval()
            acc_sum = 0
            for b_id in progressbar(range(batch_count)):
                optim.zero_grad()
                batch_x = train_x[batch_size * b_id: batch_size * (b_id + 1)]
                batch_y = train_y[batch_size * b_id: batch_size * (b_id + 1)]
                batch_x = torch.from_numpy(batch_x).permute(0, 3, 1, 2).float() / 255.
                batch_y = torch.from_numpy(batch_y).long()
                batch_x = batch_x.cuda()
                batch_y = batch_y.cuda()
                batch_x = F.interpolate(batch_x, (224, 224), mode='bilinear')
                y = net(batch_x)
                y = torch.topk(y, 1, dim=1).indices
                y = y[:, 0]
                acc = (y == batch_y).float().sum() / len(batch_x)
                acc_sum += acc.item()
            print('acc', acc_sum / batch_count)

        # Reshuffle the training data for the next epoch.
        ids = np.arange(len(train_x))
        np.random.shuffle(ids)
        train_x = train_x[ids]
        train_y = train_y[ids]
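When the segments form a plain stack of modules, the same effect can also be obtained with torch.utils.checkpoint.checkpoint_sequential, which splits an nn.Sequential into a given number of checkpointed segments. This is only a minimal sketch with a toy placeholder model:

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint_sequential

# Toy Sequential model; any stack of modules can be segmented the same way.
net = nn.Sequential(
    nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
    nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(64, 128, 3, stride=2, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(128, 10),
).cuda()

# The input must require grad (or come from something that does),
# otherwise the checkpointed segments will not build a graph.
x = torch.randn(4, 3, 224, 224, device='cuda', requires_grad=True)

# Split the Sequential into 3 checkpointed segments: only the segment
# boundaries keep activations, everything else is recomputed in backward.
y = checkpoint_sequential(net, 3, x)
y.sum().backward()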
However, when I actually added manual checkpointing to ResNet, I found that sometimes the change did not reduce memory usage at all, yet after a while it did again, and my own implementation did not reduce memory as efficiently as the official one. The modified ResNet is shown below.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint


class BasicBlock(nn.Module):
    """Basic Block for resnet 18 and resnet 34"""
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels * BasicBlock.expansion))
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * BasicBlock.expansion))

    def forward(self, x):
        return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))


class BottleNeck(nn.Module):
    """Residual block for resnet over 50 layers"""
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, stride=stride, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels * BottleNeck.expansion, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels * BottleNeck.expansion))
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels * BottleNeck.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * BottleNeck.expansion, stride=stride, kernel_size=1, bias=False),
                nn.BatchNorm2d(out_channels * BottleNeck.expansion))

    def forward(self, x):
        return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))


class ResNet(nn.Module):
    def __init__(self, block, num_block, num_classes=100):
        super().__init__()
        self.in_channels = 64
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True))
        self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
        self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
        self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
        self.conv5_x = self._make_layer(block, 512, num_block[3], 2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        # Two heads: a 1-unit classification head and a 7-unit regression head.
        self.classFc1 = nn.Linear(512 * block.expansion, 1)
        self.regFc1 = nn.Linear(512 * block.expansion, 7)

    def _make_layer(self, block, out_channels, num_blocks, stride):
        """make resnet layers (by 'layer' I don't mean a single network layer
        such as a conv layer; one layer may contain more than one residual block)

        Args:
            block: block type, basic block or bottle neck block
            out_channels: output depth channel number of this layer
            num_blocks: how many blocks per layer
            stride: the stride of the first block of this layer
        Return:
            a resnet layer
        """
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        # Wrap each stage in checkpoint so its activations are recomputed
        # during backward instead of being stored. Note: unlike NetA above,
        # nothing here forces x to require grad, so the conv1 segment may not
        # propagate gradients to its parameters.
        output = checkpoint(self.conv1, x)
        output = checkpoint(self.conv2_x, output)
        output = checkpoint(self.conv3_x, output)
        output = checkpoint(self.conv4_x, output)
        output = checkpoint(self.conv5_x, output)
        output = self.avg_pool(output)
        output = output.view(output.size(0), -1)
        x1 = self.classFc1(output)
        x1 = checkpoint(F.sigmoid, x1)
        x2 = checkpoint(self.regFc1, output)
        return x1, x2


def resnet18():
    """ return a ResNet 18 object"""
    return ResNet(BasicBlock, [2, 2, 2, 2])


def resnet34():
    """ return a ResNet 34 object"""
    return ResNet(BasicBlock, [3, 4, 6, 3])


def resnet50():
    """ return a ResNet 50 object"""
    return ResNet(BottleNeck, [3, 4, 6, 3])


def resnet101():
    """ return a ResNet 101 object"""
    return ResNet(BottleNeck, [3, 4, 23, 3])


def resnet152():
    """ return a ResNet 152 object"""
    return ResNet(BottleNeck, [3, 8, 36, 3])
With this change, ResNet-50 only saved about a third of the GPU memory, while ResNet-152 saved more than half.
Summary
The above covers the ways of dealing with insufficient GPU memory in PyTorch described in this post; hopefully they help you solve the problem you ran into.