-
Notifications
You must be signed in to change notification settings - Fork 0
/
datasets.py
74 lines (60 loc) · 2.56 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import torchvision
import torchvision.transforms as transforms
import numpy as np
import random
import os
import torch
from torch.utils.data import DataLoader
class SampledDataset(torch.utils.data.Dataset):
    """A read-only view of `dataset` restricted to the positions in `indices`.

    Item `i` of this view is item `indices[i]` of the wrapped dataset, so the
    view's length equals `len(indices)`.
    """

    def __init__(self, dataset, indices):
        # The view cannot reference more items than the underlying dataset holds.
        assert len(dataset) >= len(indices)
        self.dataset = dataset
        self.indices = indices

    def __getitem__(self, index):
        # Translate the view-local position into an index of the wrapped dataset.
        return self.dataset[self.indices[index]]

    def __len__(self):
        return len(self.indices)
def get_val_indices(datadir, size, force=False, total=50000):
    """Return a sorted list of `size` validation indices drawn from range(total).

    The sampled indices are cached in `datadir/val_indices_<size>.txt` so that
    repeated runs use the same train/val split. A cached file is reused unless
    `force` is True, in which case a fresh sample is drawn and written.

    Args:
        datadir: directory holding (or receiving) the cache file.
        size: number of validation indices to sample.
        force: if True, resample even when a cache file exists.
        total: size of the index pool to sample from (default 50000,
            the CIFAR training-set size).

    Returns:
        Sorted list of `size` distinct ints in [0, total).

    Raises:
        ValueError: if a cached file exists but holds a different number
            of indices than requested.
    """
    filename = os.path.join(datadir, 'val_indices_%d.txt' % size)
    if os.path.isfile(filename) and not force:
        with open(filename, 'r') as f:
            indices = [int(line.strip()) for line in f]
        # Explicit error instead of `assert`: asserts vanish under `python -O`,
        # and a stale/corrupt cache file must never be silently accepted.
        if len(indices) != size:
            raise ValueError('cache file %s holds %d indices, expected %d'
                             % (filename, len(indices), size))
    else:
        # random.sample accepts a range directly; no need to materialize a list.
        indices = sorted(random.sample(range(total), size))
        with open(filename, 'w') as f:
            f.write('\n'.join(str(i) for i in indices))
    return indices
def get_dataset(name, **kwargs):
    """Build (trainset, valset, testset) for a named dataset.

    Supported names: 'cifar10', 'cifar100'. Data is downloaded (if needed)
    into `~/data/<name>`. The validation set is carved out of the 50k-image
    training split using indices cached by `get_val_indices`, so the same
    split is reproduced across runs.

    Keyword Args:
        val_size: number of training images held out for validation
            (default 5000).

    Returns:
        (trainset, valset, testset) — train/val use augmentation-free or
        augmented transforms respectively; see below.

    Raises:
        ValueError: for an unsupported dataset name.
    """
    datadir = os.path.expanduser('~/data/' + name)
    # makedirs: `mkdir` would fail when the parent `~/data` does not exist,
    # and exist_ok avoids the exists()/mkdir race of the original check.
    os.makedirs(datadir, exist_ok=True)
    if name == 'cifar10' or name == 'cifar100':
        # Per-channel CIFAR statistics (RGB), expressed in [0, 1] range.
        mean = np.array([125.3, 123.0, 113.9]) / 255.0
        std = np.array([63.0, 62.1, 66.7]) / 255.0
        if name == 'cifar10':
            data = torchvision.datasets.CIFAR10
        else:
            data = torchvision.datasets.CIFAR100
        # Standard CIFAR augmentation for training only.
        train_transform = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, 4),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)])
        test_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean, std)])
        val_size = kwargs.get('val_size', 5000)
        val_indices = get_val_indices(datadir, val_size)
        # Training indices are the complement of the validation indices.
        train_indices = sorted(set(range(50000)) - set(val_indices))
        trainset = data(datadir, train=True, transform=train_transform, download=True)
        trainset = SampledDataset(trainset, train_indices)
        # Validation images come from the training split but use the
        # augmentation-free test transform.
        valset = data(datadir, train=True, transform=test_transform, download=True)
        valset = SampledDataset(valset, val_indices)
        testset = data(datadir, train=False, transform=test_transform, download=True)
        return trainset, valset, testset
    else:
        # Bug fix: the original referenced the undefined name `dataset` here,
        # so this error path itself crashed with a NameError.
        raise ValueError('unknown dataset: %s' % name)
def get_dataloader(datasets, batch_size, shuffle=True, num_workers=4):
    """Wrap each dataset in a DataLoader with shared settings.

    Bug fix: the `shuffle` parameter was previously accepted but ignored
    (`shuffle=True` was hard-coded in the DataLoader call), so validation
    and test loaders could never be made deterministic.

    Args:
        datasets: iterable of datasets (e.g. the tuple from `get_dataset`).
        batch_size: batch size for every loader.
        shuffle: whether each loader shuffles its dataset per epoch.
        num_workers: worker processes per loader (default 4, as before).

    Returns:
        List of DataLoaders, one per input dataset, in order.
    """
    return [DataLoader(d, batch_size=batch_size, shuffle=shuffle,
                       num_workers=num_workers)
            for d in datasets]