# -*- coding: UTF-8 -*-
'''
ProxylessNASNets from "ProxylessNAS: Direct Neural Architecture Search on Target Task and Hardware", arXiv:1812.00332.
Ref: https://github.com/mit-han-lab/proxylessnas
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict


def _ensure_divisible(number, divisor, min_value=None):
    '''
    Round 'number' to the nearest multiple of 'divisor', never below 'min_value'.
    Reference: the original TensorFlow repo,
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    '''
    if min_value is None:
        min_value = divisor
    new_num = max(min_value, int(number + divisor / 2) // divisor * divisor)
    # Make sure the rounding does not shrink the value by more than 10%
    if new_num < 0.9 * number:
        new_num += divisor
    return new_num
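
# Worked examples of the rounding above (illustrative values, not taken from
# the configs below): with divisor=8,
#   _ensure_divisible(40, 8)   -> 40  # already a multiple of 8
#   _ensure_divisible(35.2, 8) -> 32  # nearest multiple of 8
#   _ensure_divisible(18, 8)   -> 24  # 16 would drop more than 10% below 18,
#                                     # so the guard rounds up instead

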
class Bottleneck(nn.Module):
    '''
    The basic unit of ProxylessNASNets: the inverted residual with linear
    bottleneck proposed in MobileNetV2, here with a searched kernel size
    and expansion ratio per block.
    '''
    def __init__(self, in_channels_num, exp_ratio, out_channels_num, kernel_size, stride, use_residual, BN_momentum, BN_eps):
        '''
        exp_ratio: exp_size = in_channels_num * exp_ratio, the number of channels in the middle stage of the block
        use_residual: True or False -- whether to use the residual (skip) connection
        '''
        super(Bottleneck, self).__init__()
        assert stride in [1, 2]
        self.use_residual = use_residual
        exp_size = round(in_channels_num * exp_ratio)
        if exp_size == in_channels_num:
            # Without expansion, the first pointwise convolution is omitted
            self.conv = nn.Sequential(
                # Depthwise convolution
                nn.Conv2d(in_channels=in_channels_num, out_channels=exp_size, kernel_size=kernel_size, stride=stride,
                          padding=(kernel_size - 1) // 2, groups=in_channels_num, bias=False),
                nn.BatchNorm2d(num_features=exp_size, momentum=BN_momentum, eps=BN_eps),
                nn.ReLU6(inplace=True),
                # Linear pointwise convolution; on residual blocks the final BN
                # is wrapped and named 'lastBN' so that _initialize_weights can
                # zero its gamma when zero_gamma is set
                nn.Conv2d(in_channels=exp_size, out_channels=out_channels_num, kernel_size=1, stride=1, padding=0, bias=False),
                nn.Sequential(OrderedDict([('lastBN', nn.BatchNorm2d(num_features=out_channels_num, momentum=BN_momentum, eps=BN_eps))]))
                if use_residual else nn.BatchNorm2d(num_features=out_channels_num, momentum=BN_momentum, eps=BN_eps)
            )
        else:
            # With expansion
            self.conv = nn.Sequential(
                # Pointwise convolution for expansion
                nn.Conv2d(in_channels=in_channels_num, out_channels=exp_size, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(num_features=exp_size, momentum=BN_momentum, eps=BN_eps),
                nn.ReLU6(inplace=True),
                # Depthwise convolution
                nn.Conv2d(in_channels=exp_size, out_channels=exp_size, kernel_size=kernel_size, stride=stride,
                          padding=(kernel_size - 1) // 2, groups=exp_size, bias=False),
                nn.BatchNorm2d(num_features=exp_size, momentum=BN_momentum, eps=BN_eps),
                nn.ReLU6(inplace=True),
                # Linear pointwise convolution (same 'lastBN' naming as above)
                nn.Conv2d(in_channels=exp_size, out_channels=out_channels_num, kernel_size=1, stride=1, padding=0, bias=False),
                nn.Sequential(OrderedDict([('lastBN', nn.BatchNorm2d(num_features=out_channels_num, momentum=BN_momentum, eps=BN_eps))]))
                if use_residual else nn.BatchNorm2d(num_features=out_channels_num, momentum=BN_momentum, eps=BN_eps)
            )

    def forward(self, x):
        if self.use_residual:
            return self.conv(x) + x
        else:
            return self.conv(x)
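
# A minimal shape check of the block above (illustrative only; the sizes are
# hypothetical and not taken from any of the configs below):
#
#   block = Bottleneck(in_channels_num=32, exp_ratio=3, out_channels_num=32,
#                      kernel_size=5, stride=1, use_residual=True,
#                      BN_momentum=0.1, BN_eps=1e-3)
#   y = block(torch.zeros(1, 32, 56, 56))   # -> torch.Size([1, 32, 56, 56])
#
# With stride=2 (and hence use_residual=False) the spatial dimensions halve:
# a (1, 32, 56, 56) input would come out as (1, out_channels_num, 28, 28).

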
class ProxylessNASNets(nn.Module):
    '''
    A platform-specialized network found by ProxylessNAS: a stack of
    Bottleneck blocks whose kernel sizes, expansion ratios and widths
    were searched for a specific target platform.
    '''
    def __init__(self, mode='gpu', num_classes=1000, input_size=224, width_multiplier=1.0, BN_momentum=0.1, BN_eps=1e-3, zero_gamma=False):
        '''
        mode: 'gpu', 'cpu' or 'mobile' -- selects the platform-specific configuration of the model
        '''
        super(ProxylessNASNets, self).__init__()
        assert mode in ['gpu', 'cpu', 'mobile']
        s = 2
        if input_size == 32 or input_size == 56:
            # For CIFAR-10, CIFAR-100, Tiny-ImageNet or Downsampled ImageNet,
            # skip the early downsampling
            s = 1
        # Setting of the model
        if mode == 'gpu':
            # Configuration of a ProxylessNASNet-GPU model
            configs = [
                # kernel_size, exp_ratio, out_channels_num, use_residual, stride
                [3, 1, 24, False, 1],
                [5, 3, 32, False, s],
                [7, 3, 56, False, 2],
                [3, 3, 56, True, 1],
                [7, 6, 112, False, 2],
                [5, 3, 112, True, 1],
                [5, 6, 128, False, 1],
                [3, 3, 128, True, 1],
                [5, 3, 128, True, 1],
                [7, 6, 256, False, 2],
                [7, 6, 256, True, 1],
                [7, 6, 256, True, 1],
                [5, 6, 256, True, 1],
                [7, 6, 432, False, 1]
            ]
            first_channels_num = 40
            last_channels_num = 1728
        elif mode == 'cpu':
            # Configuration of a ProxylessNASNet-CPU model
            configs = [
                # kernel_size, exp_ratio, out_channels_num, use_residual, stride
                [3, 1, 24, False, 1],
                [3, 6, 32, False, s],
                [3, 3, 32, True, 1],
                [3, 3, 32, True, 1],
                [3, 3, 32, True, 1],
                [3, 6, 48, False, 2],
                [3, 3, 48, True, 1],
                [3, 3, 48, True, 1],
                [5, 3, 48, True, 1],
                [3, 6, 88, False, 2],
                [3, 3, 88, True, 1],
                [5, 6, 104, False, 1],
                [3, 3, 104, True, 1],
                [3, 3, 104, True, 1],
                [3, 3, 104, True, 1],
                [5, 6, 216, False, 2],
                [5, 3, 216, True, 1],
                [5, 3, 216, True, 1],
                [3, 3, 216, True, 1],
                [5, 6, 360, False, 1]
            ]
            first_channels_num = 40
            last_channels_num = 1432
        elif mode == 'mobile':
            # Configuration of a ProxylessNASNet-Mobile model
            configs = [
                # kernel_size, exp_ratio, out_channels_num, use_residual, stride
                [3, 1, 16, False, 1],
                [5, 3, 32, False, s],
                [3, 3, 32, True, 1],
                [7, 3, 40, False, 2],
                [3, 3, 40, True, 1],
                [5, 3, 40, True, 1],
                [5, 3, 40, True, 1],
                [7, 6, 80, False, 2],
                [5, 3, 80, True, 1],
                [5, 3, 80, True, 1],
                [5, 3, 80, True, 1],
                [5, 6, 96, False, 1],
                [5, 3, 96, True, 1],
                [5, 3, 96, True, 1],
                [5, 3, 96, True, 1],
                [7, 6, 192, False, 2],
                [7, 6, 192, True, 1],
                [7, 3, 192, True, 1],
                [7, 3, 192, True, 1],
                [7, 6, 320, False, 1]
            ]
            first_channels_num = 32
            last_channels_num = 1280
        divisor = 8
        ########################################################################################################################
        # Feature extraction part
        # Input layer
        input_channels_num = _ensure_divisible(first_channels_num * width_multiplier, divisor)
        last_channels_num = _ensure_divisible(last_channels_num * width_multiplier, divisor) if width_multiplier > 1 else last_channels_num
        feature_extraction_layers = []
        first_layer = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=input_channels_num, kernel_size=3, stride=s, padding=1, bias=False),
            nn.BatchNorm2d(num_features=input_channels_num, momentum=BN_momentum, eps=BN_eps),
            nn.ReLU6(inplace=True)
        )
        feature_extraction_layers.append(first_layer)
        # Stack of bottleneck blocks
        for kernel_size, exp_ratio, out_channels_num, use_residual, stride in configs:
            output_channels_num = _ensure_divisible(out_channels_num * width_multiplier, divisor)
            feature_extraction_layers.append(Bottleneck(in_channels_num=input_channels_num, exp_ratio=exp_ratio, out_channels_num=output_channels_num, kernel_size=kernel_size, stride=stride, use_residual=use_residual, BN_momentum=BN_momentum, BN_eps=BN_eps))
            input_channels_num = output_channels_num
        # The last stage
        feature_mix_layer = nn.Sequential(
            nn.Conv2d(in_channels=input_channels_num, out_channels=last_channels_num, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(num_features=last_channels_num, momentum=BN_momentum, eps=BN_eps),
            nn.ReLU6()
        )
        feature_extraction_layers.append(feature_mix_layer)
        feature_extraction_layers.append(nn.AdaptiveAvgPool2d(1))
        self.features = nn.Sequential(*feature_extraction_layers)
        ########################################################################################################################
        # Classification part
        self.classifier = nn.Sequential(
            nn.Linear(last_channels_num, num_classes)
        )
        ########################################################################################################################
        # Initialize the weights
        self._initialize_weights(zero_gamma)

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self, zero_gamma):
        '''
        Initialize the weights
        '''
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
        if zero_gamma:
            # Zero the gamma of the last BN in every residual block, so each
            # residual branch starts as an identity mapping
            for m in self.modules():
                if hasattr(m, 'lastBN'):
                    nn.init.constant_(m.lastBN.weight, 0.0)
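
# A quick illustration of the zero_gamma option (hypothetical usage, not part
# of the original file): with zero_gamma=True every 'lastBN' gamma is zeroed,
# so each residual block initially computes f(x) = x.
#
#   model = ProxylessNASNets(mode='mobile', zero_gamma=True)
#   assert all((m.lastBN.weight == 0).all()
#              for m in model.modules() if hasattr(m, 'lastBN'))

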
if __name__ == "__main__":
    import argparse
    from torchsummaryX import summary

    parser = argparse.ArgumentParser(description='width multiplier')
    parser.add_argument('--mode', type=str, default='gpu')
    parser.add_argument('--num-classes', type=int, default=1000)
    parser.add_argument('--input-size', type=int, default=224)
    parser.add_argument('--wm', type=float, default=1.0)
    args = parser.parse_args()

    model = ProxylessNASNets(mode=args.mode, num_classes=args.num_classes, input_size=args.input_size, width_multiplier=args.wm)
    model.eval()
    summary(model, torch.zeros((1, 3, args.input_size, args.input_size)))
    print('ProxylessNASNet-%s-%.2f with input size %d and output %d classes' % (args.mode, args.wm, args.input_size, args.num_classes))