My dataset is larger than memory; how do I modify my code?

  dataloader, dataset, deep-learning, python, pytorch

The code below is part of the dataset class I defined; I have left out the __getitem__(self, index) part. When I train the model with this dataset, it is too large to fit in CPU memory. How should I modify my dataset code?

import numpy as np
from torch.utils.data import Dataset

class VimeoDataset(Dataset):
    def __init__(self, dataset_name, batch_size=32):
        self.batch_size = batch_size
        self.path = '/data/dachunkai/train_sample/dataset/'
        self.dataset_name = dataset_name
        self.load_data()
        self.h = 256
        self.w = 448
        xx = np.arange(0, self.w).reshape(1, -1).repeat(self.h, 0)  # xx shape is (256, 448)
        yy = np.arange(0, self.h).reshape(-1, 1).repeat(self.w, 1)  # yy shape is (256, 448)
        self.grid = np.stack((xx, yy), 2).copy()

    def __len__(self):
        return len(self.meta_data)

    def load_data(self):
        # every .npz is loaded eagerly here, so all 100 files end up in memory at once
        self.train_data = []
        self.flow_data = []
        self.val_data = []
        for i in range(100):
            f = np.load('/data/dachunkai/train_sample/dataset/{}.npz'.format(i))
            if i < 80:
                self.train_data.append(f['i0i1gt'])
                self.flow_data.append(f['ft0ft1'])
            else:
                self.val_data.append(f['i0i1gt'])
        if self.dataset_name == 'train':
            self.meta_data = self.train_data
        else:
            self.meta_data = self.val_data
        self.nr_sample = len(self.meta_data)
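
What I have in mind is something like the lazy-loading sketch below: keep only the sample indices in __init__ and open each .npz inside __getitem__, so that only the requested file is read when a sample is needed. Is this the right direction? (The keys 'i0i1gt' and 'ft0ft1' come from my code above, but the returned tensor format is only a guess, since my real __getitem__ is not shown in the post.)

import numpy as np
import torch
from torch.utils.data import Dataset

class LazyVimeoDataset(Dataset):
    """Keeps only file indices in memory; each .npz is read on demand."""
    def __init__(self, dataset_name, path='/data/dachunkai/train_sample/dataset/'):
        self.path = path
        self.dataset_name = dataset_name
        # files 0..79 are training samples, 80..99 are validation samples
        if dataset_name == 'train':
            self.indices = list(range(80))
        else:
            self.indices = list(range(80, 100))
        # (the self.h / self.w / self.grid setup from the original __init__ would stay the same)

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, index):
        # the file is opened only when this particular sample is requested
        f = np.load('{}{}.npz'.format(self.path, self.indices[index]))
        imgs = torch.from_numpy(f['i0i1gt'])
        if self.dataset_name == 'train':
            flow = torch.from_numpy(f['ft0ft1'])
            return imgs, flow
        return imgs

If this works, memory use should stay at roughly one sample per DataLoader worker, and setting num_workers in the DataLoader would overlap the file reads with training.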

