# Modified_CNN.py
# (Recovered from a web scrape; the original page header and line-number
# gutter — 375 lines / 17.2 KB per the listing — have been removed so the
# file is valid Python again.)
import torchvision
from torch.nn.init import normal_, constant_
from torch import nn
import numpy as np
import torch
from basic_ops import ConsensusModule, Identity
from transforms import *
#nn.Moudle is a base class, the model class should subclass this one
class TSN_model(nn.Module):
    """Temporal Segment Network (TSN).

    Wraps a 2D CNN backbone (a torchvision ResNet or BNInception), replaces
    its final fully-connected layer to match ``num_classes``, and fuses the
    per-segment predictions with a consensus module.

    Args:
        num_classes: number of action classes in the target dataset.
        num_segments: number of temporal segments sampled per video.
        modality: 'RGB' or 'RGBDiff'.
        consensus_type: segment consensus operator ('avg' or 'identity').
        base_model_name: backbone architecture name (e.g. 'resnet18',
            'BNInception').
        new_length: frames picked per segment; defaults to 1 for RGB and
            5 otherwise.
        before_softmax: if False, a Softmax is applied before consensus
            (only valid together with 'avg' consensus).
        dropout: dropout ratio inserted before the new classifier layer.
        crop_num: number of spatial crops used at test time.
        partial_bn: freeze every BatchNorm2d layer except the first one
            during training (see ``train``).
        KinWeights: optional path to Kinetics-pretrained weights to load.
    """

    def __init__(self, num_classes, num_segments, modality,
                 consensus_type='avg', base_model_name='resnet18',
                 new_length=None, before_softmax=True, dropout=0.8,
                 crop_num=1, partial_bn=True, KinWeights=''):
        # Execute all nn.Module __init__ machinery before registering any
        # attributes/sub-modules on self.
        super(TSN_model, self).__init__()
        self.num_classes = num_classes
        self.num_segments = num_segments
        self.modality = modality
        self.base_model_name = base_model_name
        self.consensus_type = consensus_type
        self.before_softmax = before_softmax
        self.dropout = dropout
        self.crop_num = crop_num
        self.partial_bn = partial_bn
        self.KinWeights = KinWeights
        if not before_softmax and consensus_type != 'avg':
            raise ValueError("Only avg consensus can be used after Softmax")
        # Number of frames picked from each segment.
        if new_length is None:
            self.new_length = 1 if modality == "RGB" else 5
        else:
            self.new_length = new_length
        print(("""
Initializing TSN with base model: {}.
TSN Configurations:
input_modality: {}
num_segments: {}
new_length: {}
consensus_module: {}
dropout_ratio: {}
""".format(base_model_name, self.modality, self.num_segments,
           self.new_length, self.consensus_type, self.dropout)))
        # Load the backbone and adapt its classifier head to our dataset.
        print('Load and modify the standard model FC output layer')
        self.prepare_model(base_model_name, num_classes)
        print('Done. Loading and Modifying \n ---------------------------------------------------')
        if self.modality == 'RGBDiff':
            print("Converting the ImageNet model to RGBDiff model")
            self.base_model = self.Modify_RGBDiff_Model(self.base_model, keep_rgb=False)
            print("Done. RGBDiff model is ready.")
        if self.KinWeights:
            self.AddKinWeights(self.KinWeights)
        # Consensus layer (only 'avg' and 'identity' are available).
        self.consensus = ConsensusModule(consensus_type)
        # Softmax layer, only when scores are normalized before consensus.
        if not self.before_softmax:
            self.softmax = nn.Softmax()
        self._enable_pbn = partial_bn
        if partial_bn:
            self.partialBN(True)

    def prepare_model(self, base_model_name, num_classes):
        """Load the pretrained backbone and replace its last FC layer.

        Sets ``self.base_model``, ``self.last_layer_name``, the expected
        ``input_size`` / ``input_mean`` / ``input_std``, and either rewires
        the backbone's classifier to ``num_classes`` (dropout == 0) or
        swaps it for a Dropout layer and stores the new classifier in
        ``self.new_fc`` (applied later in ``forward``).

        Raises:
            ValueError: if ``base_model_name`` is not supported.
        """
        # Add other architectures later.
        if 'resnet' in base_model_name:
            # Load the ImageNet-pretrained model from torchvision.
            self.base_model = getattr(torchvision.models, base_model_name)(pretrained=True)
            self.last_layer_name = 'fc'
            # Expected input size and ImageNet normalization statistics.
            self.input_size = 224
            self.input_mean = [0.485, 0.456, 0.406]
            self.input_std = [0.229, 0.224, 0.225]
            # There is no point in subtracting means from RGBDiff frames,
            # so the extra channels get mean 0 and a scaled std.
            if self.modality == 'RGBDiff':
                # [0.485, 0.456, 0.406, 0, 0, 0, ...]
                self.input_mean = [0.485, 0.456, 0.406] + [0] * 3 * self.new_length
                # Extend the std list with twice the average (~0.452).
                self.input_std = self.input_std + [np.mean(self.input_std) * 2] * 3 * self.new_length
        # BNInception doesn't exist in torchvision models, so it comes from
        # the local `net` package.
        elif base_model_name == 'BNInception':
            import net
            self.base_model = net.bn_inception(pretrained=True)
            self.last_layer_name = 'last_linear'
            self.input_size = 224
            self.input_mean = [104, 117, 128]
            self.input_std = [1]
            # BNInception takes input in the range 0~255, so only mean
            # subtraction is used in preprocessing.  This differs from the
            # ResNet models, which take input in the range 0~1.
            if self.modality == 'RGBDiff':
                self.input_mean = self.input_mean * (1 + self.new_length)
        else:
            raise ValueError('Unknown base model: {}'.format(base_model_name))
        # Input size of the backbone's last (classifier) layer.
        features_dim = getattr(self.base_model, self.last_layer_name).in_features
        if self.dropout == 0:
            # No dropout: replace the classifier directly with a new Linear.
            setattr(self.base_model, self.last_layer_name, nn.Linear(features_dim, num_classes))
            self.new_fc = None
            print('The modified linear layer is :', getattr(self.base_model, self.last_layer_name))
        else:
            # With dropout: the backbone ends in Dropout and the new Linear
            # classifier is kept aside, applied manually in forward().
            setattr(self.base_model, self.last_layer_name, nn.Dropout(self.dropout))
            self.new_fc = nn.Linear(features_dim, num_classes)
            print('Dropout Layer added and The modified linear layer is :', self.new_fc)
        # Initialize the newly created Linear layer's weights.
        std = 0.001
        if self.new_fc is None:  # idiom fix: `is None` instead of `== None`
            normal_(getattr(self.base_model, self.last_layer_name).weight, 0, std)
            constant_(getattr(self.base_model, self.last_layer_name).bias, 0)
        else:
            normal_(self.new_fc.weight, 0, std)
            constant_(self.new_fc.bias, 0)

    def Modify_RGBDiff_Model(self, base_model, keep_rgb=False):
        """Rebuild the first Conv2d so its input channels fit RGBDiff input.

        The new kernel weights are the channel-mean of the pretrained RGB
        kernel, replicated across the ``3 * new_length`` difference
        channels (optionally keeping the original RGB channels as well).

        NOTE(review): this method actually operates on ``self.base_model``
        and only returns the ``base_model`` argument — fine today because
        the caller passes ``self.base_model``, but worth confirming before
        calling it with any other model.
        """
        modules = list(self.base_model.modules())
        # Find the index of the first Conv2d layer.
        for i in range(len(modules)):
            if isinstance(modules[i], nn.Conv2d):
                first_conv_idx = i
                break
        # The first Conv2d and its containing module (needed for setattr).
        conv_layer = modules[first_conv_idx]
        container = modules[first_conv_idx - 1]
        # params[0] is the weight tensor (e.g. (64,3,7,7)); params[1], if
        # present, is the bias tensor.
        params = [x.clone() for x in conv_layer.parameters()]
        kernel_size = params[0].size()
        # Build the new kernel: channel-mean of the RGB weights expanded to
        # 3 * new_length input channels (typically 5 stacked diff frames).
        if not keep_rgb:
            new_kernel_size = kernel_size[:1] + (3 * self.new_length,) + kernel_size[2:]
            new_kernel_Weights = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()
        else:
            # Keep the original RGB channels and append the averaged ones.
            new_kernel_size = kernel_size[:1] + (3 * self.new_length,) + kernel_size[2:]
            new_kernel_Weights = torch.cat((params[0].data, params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()), 1)
            new_kernel_size = new_kernel_Weights.size()
        # Create a replacement input layer with the proper channel count.
        new_conv = nn.Conv2d(new_kernel_size[1], conv_layer.out_channels, conv_layer.kernel_size,
                             conv_layer.stride, conv_layer.padding, bias=True if len(params) == 2 else False)
        # Override the parameters with the adapted pretrained weights.
        new_conv.weight.data = new_kernel_Weights
        if len(params) == 2:
            # Copy the bias if the original layer had one.
            new_conv.bias.data = params[1].data
        # Drop the '.weight' suffix (7 chars) to recover the layer's name.
        layer_name = list(container.state_dict().keys())[0][:-7]
        # Replace the first convolution layer in its container.
        setattr(container, layer_name, new_conv)
        print('The modified 1st layer is', new_conv)
        return base_model

    def AddKinWeights(self, KinWeights):
        """Load Kinetics-pretrained weights from ``KinWeights`` into self.

        The checkpoint's classifier entries ('fc_action.*') are replaced by
        the freshly initialized classifier of this model (the class counts
        differ); all other tensors are loaded, squeezed of a possible
        leading singleton dimension.
        """
        state_dictTemp = {}
        print('Loading Kinetics weights')
        state_dict = torch.load(KinWeights)
        if self.modality == 'RGBDiff':
            print('Convert flow weights to RGBDiff weights')
            # Average the flow kernel's input channels and expand to the
            # 3 * new_length RGBDiff channels.
            state_dict['conv1_7x7_s2.weight'] = state_dict['conv1_7x7_s2.weight'].mean(dim=1, keepdim=True).expand([64, 3 * self.new_length, 7, 7]).contiguous().float()
        # Prefix keys so they target self.base_model's parameters.
        state_dict = {'base_model.' + k: v for k, v in state_dict.items()}
        if self.dropout == 0:
            for k, v in state_dict.items():
                if k == 'base_model.fc_action.weight':
                    state_dictTemp["base_model.last_linear.weight"] = getattr(self.base_model, self.last_layer_name).weight
                elif k == 'base_model.fc_action.bias':
                    state_dictTemp["base_model.last_linear.bias"] = getattr(self.base_model, self.last_layer_name).bias
                else:
                    state_dictTemp[k] = torch.squeeze(v, dim=0)
        else:
            for k, v in state_dict.items():
                if k == 'base_model.fc_action.weight':
                    state_dictTemp["new_fc.weight"] = self.new_fc.weight
                elif k == 'base_model.fc_action.bias':
                    state_dictTemp["new_fc.bias"] = self.new_fc.bias
                else:
                    state_dictTemp[k] = torch.squeeze(v, dim=0)
        self.load_state_dict(state_dictTemp)

    def train(self, mode=True):
        """Switch to train mode, optionally freezing BatchNorm layers.

        When partial BN is enabled, every BatchNorm2d except the first one
        is put in eval mode and its affine parameters are frozen.
        See https://arxiv.org/pdf/1502.03167.pdf for background.

        Args:
            mode: True for training mode (as in ``nn.Module.train``).
        """
        super(TSN_model, self).train(mode)
        count = 0
        # BUG FIX: read self._enable_pbn (set unconditionally in __init__)
        # instead of self.enable_pbn, which was only defined after a call
        # to partialBN() and raised AttributeError when partial_bn=False.
        if self._enable_pbn:
            for m in self.base_model.modules():
                if isinstance(m, nn.BatchNorm2d):
                    # Freeze every BN layer except the first one.
                    if not self.partial_bn or count > 0:
                        m.eval()
                        m.weight.requires_grad = False
                        m.bias.requires_grad = False
                    else:
                        count += 1  # BUG FIX: was `count=+1` (assignment, not increment)
                        continue

    def extract_rgbDiff(self, RGB_tensor, keep_rgb=False):
        """Subtract consecutive frames to obtain RGB-difference frames.

        The length of frames for each segment is usually 5 (changeable, but
        5 gave the best accuracy in the paper).

        Args:
            RGB_tensor: all frames, shape
                (Batch_size, num_segments*(new_length+1)*3, H, W).
            keep_rgb: True keeps one RGB frame per segment
                ([RGB, RGBDiff, RGBDiff, ...]); False makes all frames
                RGBDiff.

        Returns:
            Tensor of shape (Batch_size, num_segments, new_length[+1], 3, H, W).
        """
        # Reshape to (batch, segments, frames-per-segment, C, H, W).
        RGB_tensor = RGB_tensor.view((-1, self.num_segments, self.new_length + 1, 3) + RGB_tensor.size()[2:])
        # keep_rgb was a trial by the authors to improve accuracy; it is
        # kept False since it only gives a marginal increase.
        if keep_rgb:
            RGBDiff_tensor = RGB_tensor.clone()
        else:
            RGBDiff_tensor = RGB_tensor[:, :, 1:, :, :, :].clone()
        # Generate the RGBDiff frames, iterating backwards so each slot is
        # read before it is overwritten.  With keep_rgb the first frame of
        # each segment stays a plain (non-subtracted) RGB frame.
        for x in reversed(list(range(1, self.new_length + 1))):
            if keep_rgb:
                RGBDiff_tensor[:, :, x, :, :, :] = RGB_tensor[:, :, x, :, :, :] - RGB_tensor[:, :, x - 1, :, :, :]
            else:
                RGBDiff_tensor[:, :, x - 1, :, :, :] = RGB_tensor[:, :, x, :, :, :] - RGB_tensor[:, :, x - 1, :, :, :]
        return RGBDiff_tensor

    def partialBN(self, enable):
        """Enable/disable partial BatchNorm freezing (applied in train())."""
        # CONSISTENCY FIX: keep both attribute spellings in sync — __init__
        # and get_optim_policies used `_enable_pbn` while this method only
        # set `enable_pbn`.
        self._enable_pbn = enable
        self.enable_pbn = enable

    def get_optim_policies(self):
        """Group parameters into per-type optimizer policies.

        Returns a list of param-group dicts with learning-rate and weight-
        decay multipliers: first conv weight/bias, normal weights/biases,
        and BN scale/shift (BN params are included only for the first BN
        layer when partial BN is enabled).

        Raises:
            ValueError: on an unknown atomic module type with parameters.
        """
        first_conv_weight = []
        first_conv_bias = []
        normal_weight = []
        normal_bias = []
        bn = []
        conv_cnt = 0
        bn_cnt = 0
        for m in self.modules():
            if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Conv1d):
                ps = list(m.parameters())
                conv_cnt += 1
                if conv_cnt == 1:
                    first_conv_weight.append(ps[0])
                    if len(ps) == 2:
                        first_conv_bias.append(ps[1])
                else:
                    normal_weight.append(ps[0])
                    if len(ps) == 2:
                        normal_bias.append(ps[1])
            elif isinstance(m, torch.nn.Linear):
                ps = list(m.parameters())
                normal_weight.append(ps[0])
                if len(ps) == 2:
                    normal_bias.append(ps[1])
            elif isinstance(m, torch.nn.BatchNorm1d):
                bn.extend(list(m.parameters()))
            elif isinstance(m, torch.nn.BatchNorm2d):
                bn_cnt += 1
                # Later BN layers are frozen under partial BN.
                if not self._enable_pbn or bn_cnt == 1:
                    bn.extend(list(m.parameters()))
            elif len(m._modules) == 0:
                if len(list(m.parameters())) > 0:
                    raise ValueError("New atomic module type: {}. Need to give it a learning policy".format(type(m)))
        return [
            {'params': first_conv_weight, 'lr_mult': 1, 'decay_mult': 1, 'name': "first_conv_weight"},
            {'params': first_conv_bias, 'lr_mult': 2, 'decay_mult': 0, 'name': "first_conv_bias"},
            {'params': normal_weight, 'lr_mult': 1, 'decay_mult': 1, 'name': "normal_weight"},
            {'params': normal_bias, 'lr_mult': 2, 'decay_mult': 0, 'name': "normal_bias"},
            {'params': bn, 'lr_mult': 1, 'decay_mult': 0, 'name': "BN scale/shift"},
        ]

    def forward(self, input):
        """Run segments through the backbone and fuse them by consensus.

        Args:
            input: tensor of shape (Batch_size, num_segments*channels, H, W),
                where channels is 3 for RGB and (new_length+1)*3 for RGBDiff.

        Returns:
            Tensor of shape (Batch_size, num_classes).
        """
        # Total channels per sample: 3 for RGB, 15 for RGBDiff (new_length=5).
        sample_len = 3 * self.new_length
        if self.modality == 'RGBDiff':
            # -> (Batch_size, num_segments, new_length, 3, H, W)
            input = self.extract_rgbDiff(input)
        # Fold segments into the batch so the 2D CNN can process them:
        # -> (Batch_size*num_segments, new_length*3, H, W)
        input = input.view((-1, sample_len) + input.size()[-2:])
        FProp = self.base_model(input)
        # With dropout, the classifier layer lives outside the backbone.
        if self.dropout > 0:
            FProp = self.new_fc(FProp)
        # Optionally normalize scores before consensus.
        if not self.before_softmax:
            FProp = self.softmax(FProp)
        # CNN output is (Batch_size*num_segments, num_classes); restore the
        # segment dimension -> (Batch_size, num_segments, num_classes).
        FProp = FProp.view((-1, self.num_segments) + FProp.size()[1:])
        # Consensus over the segment dimension (dim=1).
        output = self.consensus(FProp)
        return output.squeeze(1)

    @property
    def crop_size(self):
        # Crop size equals the backbone's expected input size.
        return self.input_size

    @property
    def scale_size(self):
        # Resize target before cropping (standard 256/224 ratio).
        return self.input_size * 256 // 224

    def get_augmentation(self):
        """Return the training-time augmentation pipeline for the modality."""
        if self.modality == 'RGB':
            return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75, .66]),
                                                   GroupRandomHorizontalFlip(is_flow=False)])
        if self.modality == 'RGBDiff':
            return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75]),
                                                   GroupRandomHorizontalFlip(is_flow=False)])