Model Training Acceleration Methods
Learning Rate Setting
- lr = 0.00125 * num_gpu * samples_per_gpu, i.e., the linear scaling rule: scale the learning rate linearly with the total batch size.
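A minimal sketch of applying the rule; the SGD settings and the stand-in model are assumptions for illustration. The 0.00125 base reproduces the common detection default of lr = 0.02 for 8 GPUs with 2 samples each (used, e.g., in MMDetection):

import torch

num_gpu = 8
samples_per_gpu = 2
lr = 0.00125 * num_gpu * samples_per_gpu  # -> 0.02

model = torch.nn.Linear(10, 2)  # stand-in for the real model
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)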
Data Loading Acceleration
- Data prefetching in a background thread (via the third-party prefetch_generator package), so the next batch is being loaded while the current one is consumed:

# pip install prefetch_generator
from torch.utils.data import DataLoader
from prefetch_generator import BackgroundGenerator

# Use DataLoaderX in place of DataLoader
class DataLoaderX(DataLoader):
    def __iter__(self):
        return BackgroundGenerator(super().__iter__())
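DataLoaderX is a drop-in replacement for DataLoader; a brief usage sketch (train_dataset and the loader arguments are placeholders):

train_loader = DataLoaderX(train_dataset, batch_size=32, shuffle=True,
                           num_workers=4, pin_memory=True)
for images, labels in train_loader:
    ...  # the next batch is already being fetched in a background thread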
- Overlapping host-to-device copies with torch.cuda.Stream: while the default stream computes on the current batch, a side stream copies the next batch to the GPU. The second class below (data_prefetcher) is the pattern from NVIDIA Apex's ImageNet example:

import torch

# The code below assumes AMP mixed-precision training, where there is no need
# to convert data to half manually; if you run fp16 without AMP, add:
#   if args.fp16:
#       self.mean = self.mean.half()
#       self.std = self.std.half()

class DataPrefetcher():
    def __init__(self, loader, opt):
        self.loader = iter(loader)
        self.opt = opt
        self.stream = torch.cuda.Stream()
        self.preload()

    def preload(self):
        try:
            self.batch = next(self.loader)
        except StopIteration:
            self.batch = None
            return
        with torch.cuda.stream(self.stream):
            for k in self.batch:
                if k != 'meta':
                    self.batch[k] = self.batch[k].to(device=self.opt.device,
                                                     non_blocking=True)

    def next(self):
        # Make the default stream wait until the side-stream copy is done
        torch.cuda.current_stream().wait_stream(self.stream)
        batch = self.batch
        self.preload()
        return batch


class data_prefetcher():
    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()
        self.mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).cuda().view(1, 3, 1, 1)
        self.std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).cuda().view(1, 3, 1, 1)
        # With Amp, it isn't necessary to manually convert data to half.
        # if args.fp16:
        #     self.mean = self.mean.half()
        #     self.std = self.std.half()
        self.preload()

    def preload(self):
        try:
            self.next_input, self.next_target = next(self.loader)
        except StopIteration:
            self.next_input = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)
            # With Amp, it isn't necessary to manually convert data to half.
            # if args.fp16:
            #     self.next_input = self.next_input.half()
            # else:
            self.next_input = self.next_input.float()
            self.next_input = self.next_input.sub_(self.mean).div_(self.std)

    def next(self):
        torch.cuda.current_stream().wait_stream(self.stream)
        input = self.next_input
        target = self.next_target
        self.preload()
        return input, target


# Data loading before adding the prefetcher:
for iter_id, batch in enumerate(data_loader):
    if iter_id >= num_iters:
        break
    for k in batch:
        if k != 'meta':
            batch[k] = batch[k].to(device=opt.device, non_blocking=True)
    run_step()

# Data loading after adding the prefetcher:
prefetcher = DataPrefetcher(data_loader, opt)
batch = prefetcher.next()
iter_id = 0
while batch is not None:
    iter_id += 1
    if iter_id >= num_iters:
        break
    run_step()
    batch = prefetcher.next()
OCR Model Training Tricks
- Punctuation: when building the dataset, convert Chinese full-width punctuation such as [,.' ";:] to the English half-width forms (or the reverse), so the label set never contains two copies of the same symbol. Both attention-based OCR and CTC models effectively recognize glyph shapes, so a Chinese semicolon and an English semicolon look like the same thing to the model and get confused; see the normalization sketch after this list.
- Training set: ctc_loss takes a sequence_length argument, so to make the data distribution more consistent and CTC more efficient, sort the samples by label length when building the dataset. For example, the first 100 samples have labels shorter than 5 characters, the next 100 shorter than 10, the next 100 shorter than 15, and so on.
- Independent across batches, equal within a batch: images within one batch must share the same size, but with a fully convolutional network different batches can use different sizes, so the resize target can be chosen per batch. For example, the first batch reads labels shorter than 5 characters and resizes the images to 100×32; the second reads labels shorter than 10 and resizes to 200×32; see the bucketing sketch after this list.
- Two-tail problem of the training set: for balance, drop samples whose labels occur extremely rarely or extremely often, so that every character appears with a moderate frequency.
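The normalization sketch promised above: map full-width (Chinese) punctuation to half-width ASCII when building the label files. The mapping table is illustrative, not exhaustive:

# Map common full-width punctuation to its ASCII counterpart so the label
# set never contains two visually near-identical symbols.
FULL2HALF = str.maketrans({
    ',': ',', '。': '.', ';': ';', ':': ':',
    '!': '!', '?': '?', '“': '"', '”': '"',
    '‘': "'", '’': "'", '(': '(', ')': ')',
})

def normalize_label(text: str) -> str:
    return text.translate(FULL2HALF)

assert normalize_label('你好,世界;') == '你好,世界;'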
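The bucketing sketch promised above, combining length sorting with per-batch resizing. All names here (make_buckets, collate_batch) are hypothetical; a real implementation would live in a custom sampler and collate_fn:

import torch
import torch.nn.functional as F

def make_buckets(samples, batch_size=100):
    # samples: list of (image_tensor, label_str) pairs. Sorting by label
    # length keeps labels of similar length in the same batch.
    samples = sorted(samples, key=lambda s: len(s[1]))
    return [samples[i:i + batch_size] for i in range(0, len(samples), batch_size)]

def collate_batch(batch):
    # Resize every image in this batch to one shared size; the width may
    # differ between batches because the network is fully convolutional.
    max_len = max(max(len(label) for _, label in batch), 1)
    width = 100 * ((max_len - 1) // 5 + 1)  # len<=5 -> 100x32, len<=10 -> 200x32, ...
    images = torch.stack([
        F.interpolate(img.unsqueeze(0), size=(32, width),
                      mode='bilinear', align_corners=False).squeeze(0)
        for img, _ in batch])
    labels = [label for _, label in batch]
    return images, labels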
Handling Class Imbalance in PyTorch
- Two common remedies, both built on inverse-frequency class weights len(dataset) / class_count: resample the data with WeightedRandomSampler, or weight the loss per class.

# Data side: oversample rare classes with WeightedRandomSampler
import math
from collections import Counter

import pandas as pd
import torch
import torch.nn as nn
import torchvision
from efficientnet_pytorch import EfficientNet  # assumed package for the EfficientNet branch
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch.utils.data.dataset import random_split

# The two functions below are methods of a trainer class: self.batch_size,
# self.num_classes, self.device, etc. are attributes of that class, and
# train_full is the full training Dataset, defined elsewhere.

def load_data(self, sample):
    train_set, val_set = random_split(
        train_full,
        [math.floor(len(train_full) * 0.8), math.ceil(len(train_full) * 0.2)])
    self.train_classes = [label for _, label in train_set]
    if sample:
        # Need to get a weight for every image in the dataset
        class_count = Counter(self.train_classes)
        # Can't iterate over class_count directly because the dict is unordered
        class_weights = torch.Tensor(
            [len(self.train_classes) / c
             for c in pd.Series(class_count).sort_index().values])
        sample_weights = [0] * len(train_set)
        for idx, (image, label) in enumerate(train_set):
            sample_weights[idx] = class_weights[label]
        sampler = WeightedRandomSampler(weights=sample_weights,
                                        num_samples=len(train_set),
                                        replacement=True)
        train_loader = DataLoader(train_set, batch_size=self.batch_size,
                                  sampler=sampler)
    else:
        train_loader = DataLoader(train_set, batch_size=self.batch_size,
                                  shuffle=True)
    val_loader = DataLoader(val_set, batch_size=self.batch_size)
    return train_loader, val_loader


# Model side: weight the loss per class
def load_model(self, arch='resnet'):
    if arch == 'resnet':
        self.model = torchvision.models.resnet50(pretrained=True)
        if self.freeze_backbone:
            for param in self.model.parameters():
                param.requires_grad = False
        self.model.fc = nn.Linear(in_features=self.model.fc.in_features,
                                  out_features=self.num_classes)
    elif arch == 'efficient-net':
        self.model = EfficientNet.from_pretrained('efficientnet-b7')
        if self.freeze_backbone:
            for param in self.model.parameters():
                param.requires_grad = False
        self.model._fc = nn.Linear(in_features=self.model._fc.in_features,
                                   out_features=self.num_classes)
    self.model = self.model.to(self.device)
    self.optimizer = torch.optim.Adam(self.model.parameters(), self.lr)
    if self.loss_weights:
        class_count = Counter(self.train_classes)
        class_weights = torch.Tensor(
            [len(self.train_classes) / c
             for c in pd.Series(class_count).sort_index().values])
        class_weights = class_weights.to(self.device)
        self.criterion = nn.CrossEntropyLoss(class_weights)
    else:
        self.criterion = nn.CrossEntropyLoss()
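A design note on the two remedies: they compensate for the same imbalance, so they are normally used one at a time; combining WeightedRandomSampler with inverse-frequency loss weights would correct for the rare classes twice.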
Early Stopping
- Stop training once the validation loss has not improved by at least delta for patience consecutive checks, saving a checkpoint of the best model along the way:

# Callbacks: early stopping
import torch

class EarlyStopping:
    def __init__(self, patience=1, delta=0, path='checkpoint.pt'):
        self.patience = patience
        self.delta = delta
        self.path = path
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, val_loss, model):
        if self.best_score is None:
            self.best_score = val_loss
            self.save_checkpoint(model)
        elif val_loss > self.best_score - self.delta:
            # No improvement of at least delta
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_loss
            self.save_checkpoint(model)
            self.counter = 0

    def save_checkpoint(self, model):
        torch.save(model.state_dict(), self.path)
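A minimal usage sketch; train_one_epoch, evaluate, val_loader, and max_epochs are hypothetical placeholders:

early_stopping = EarlyStopping(patience=5, path='checkpoint.pt')

for epoch in range(max_epochs):
    train_one_epoch(model)                  # hypothetical training step
    val_loss = evaluate(model, val_loader)  # hypothetical validation step
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print(f'Early stopping at epoch {epoch}')
        break

# Restore the best weights saved by the callback
model.load_state_dict(torch.load('checkpoint.pt'))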
Summary