Hi! I have a problem with the SSD300 implementation. I'm using a dataset of 1000 images: 750 of them for training and 250 for validation. My dataset has only one positive class.
My training code is the following:
```python
# Imports (as in the repository's SSD300 training notebook):
from math import ceil
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, CSVLogger, LearningRateScheduler, TerminateOnNaN

from models.keras_ssd300 import ssd_300
from keras_loss_function.keras_ssd_loss import SSDLoss
from ssd_encoder_decoder.ssd_input_encoder import SSDInputEncoder
from data_generator.object_detection_2d_data_generator import DataGenerator
from data_generator.data_augmentation_chain_original_ssd import SSDDataAugmentation
from data_generator.object_detection_2d_geometric_ops import Resize
from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels

# Model configuration.
img_height = 300
img_width = 300
img_channels = 3
mean_color = [123, 117, 104]
swap_channels = [2, 1, 0]
n_classes = 1
scales_pascal = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
scales_coco = [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]
scales = scales_pascal
aspect_ratios = [[1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5]]
two_boxes_for_ar1 = True
steps = [8, 16, 32, 64, 100, 300]
offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
clip_boxes = False
variances = [0.1, 0.1, 0.2, 0.2]
normalize_coords = True

# Build the model in 'training' mode and load the pre-trained VGG-16 weights.
K.clear_session()
model = ssd_300(image_size=(img_height, img_width, img_channels),
                n_classes=n_classes,
                mode='training',
                l2_regularization=0.0005,
                scales=scales,
                aspect_ratios_per_layer=aspect_ratios,
                two_boxes_for_ar1=two_boxes_for_ar1,
                steps=steps,
                offsets=offsets,
                clip_boxes=clip_boxes,
                variances=variances,
                normalize_coords=normalize_coords,
                subtract_mean=mean_color,
                swap_channels=swap_channels)

weights_path = 'VGG_weights/VGG_ILSVRC_16_layers_fc_reduced.h5'
model.load_weights(weights_path, by_name=True)

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

# Set up the data generators and parse the Pascal-VOC-style annotations.
train_dataset = DataGenerator(load_images_into_memory=True, hdf5_dataset_path=None)
val_dataset = DataGenerator(load_images_into_memory=True, hdf5_dataset_path=None)

myDataSet_train_images_dir = 'myDatasets/Training/JPEGImages/'
myDataSet_train_annotations_dir = 'myDatasets/Training/Annotations/'
myDataSet_trainval_image_set_filename = 'myDatasets/Training/ImageSets/Main/default.txt'
myDataSet_test_images_dir = 'myDatasets/Testing/JPEGImages/'
myDataSet_test_annotations_dir = 'myDatasets/Testing/Annotations/'
myDataSet_test_image_set_filename = 'myDatasets/Testing/ImageSets/Main/default.txt'

classes = ['background',
           'Plant']

train_dataset.parse_xml(images_dirs=[myDataSet_train_images_dir],
                        image_set_filenames=[myDataSet_trainval_image_set_filename],
                        annotations_dirs=[myDataSet_train_annotations_dir],
                        classes=classes,
                        include_classes='all',
                        exclude_truncated=False,
                        exclude_difficult=False,
                        ret=False)

val_dataset.parse_xml(images_dirs=[myDataSet_test_images_dir],
                      image_set_filenames=[myDataSet_test_image_set_filename],
                      annotations_dirs=[myDataSet_test_annotations_dir],
                      classes=classes,
                      include_classes='all',
                      exclude_truncated=False,
                      exclude_difficult=True,
                      ret=False)

batch_size = 5

ssd_data_augmentation = SSDDataAugmentation(img_height=img_height,
                                            img_width=img_width,
                                            background=mean_color)
convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=img_height, width=img_width)

# The spatial dimensions of the predictor layers are needed by the encoder.
predictor_sizes = [model.get_layer('conv4_3_norm_mbox_conf').output_shape[1:3],
                   model.get_layer('fc7_mbox_conf').output_shape[1:3],
                   model.get_layer('conv6_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv7_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv8_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv9_2_mbox_conf').output_shape[1:3]]

ssd_input_encoder = SSDInputEncoder(img_height=img_height,
                                    img_width=img_width,
                                    n_classes=n_classes,
                                    predictor_sizes=predictor_sizes,
                                    scales=scales,
                                    aspect_ratios_per_layer=aspect_ratios,
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    steps=steps,
                                    offsets=offsets,
                                    clip_boxes=clip_boxes,
                                    variances=variances,
                                    matching_type='multi',
                                    pos_iou_threshold=0.5,
                                    neg_iou_limit=0.5,
                                    normalize_coords=normalize_coords)

train_generator = train_dataset.generate(batch_size=batch_size,
                                         shuffle=True,
                                         transformations=[ssd_data_augmentation],
                                         label_encoder=ssd_input_encoder,
                                         returns={'processed_images',
                                                  'encoded_labels'},
                                         keep_images_without_gt=False)

val_generator = val_dataset.generate(batch_size=batch_size,
                                     shuffle=False,
                                     transformations=[convert_to_3_channels,
                                                      resize],
                                     label_encoder=ssd_input_encoder,
                                     returns={'processed_images',
                                              'encoded_labels'},
                                     keep_images_without_gt=False)

train_dataset_size = train_dataset.get_dataset_size()
val_dataset_size = val_dataset.get_dataset_size()

print("Number of images in the training dataset:\t{:>6}".format(train_dataset_size))
print("Number of images in the validation dataset:\t{:>6}".format(val_dataset_size))

def lr_schedule(epoch):
    if epoch < 300:
        return 0.0001
    elif epoch < 450:
        return 0.00001
    else:
        return 0.000001

model_checkpoint = ModelCheckpoint(filepath='ssd300_pascal_07+12_epoch-{epoch:02d}_loss-{loss:.4f}_val_loss-{val_loss:.4f}.h5',
                                   monitor='val_loss',
                                   verbose=1,
                                   save_best_only=True,
                                   save_weights_only=False,
                                   mode='auto',
                                   period=1)
csv_logger = CSVLogger(filename='ssd300_pascal_07+12_training_log.csv',
                       separator=',',
                       append=True)
learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule,
                                                verbose=1)
terminate_on_nan = TerminateOnNaN()

callbacks = [model_checkpoint,
             csv_logger,
             learning_rate_scheduler,
             terminate_on_nan]

initial_epoch = 0
final_epoch = 1000
steps_per_epoch = 1000

history = model.fit_generator(generator=train_generator,
                              steps_per_epoch=steps_per_epoch,
                              epochs=final_epoch,
                              callbacks=callbacks,
                              validation_data=val_generator,
                              validation_steps=ceil(val_dataset_size/batch_size),
                              initial_epoch=initial_epoch)
```
The inference code is the following:
```python
# Imports (as in the repository's SSD300 inference notebook):
import sys
import numpy as np
from matplotlib import pyplot as plt
from imageio import imread
from keras import backend as K
from keras.models import load_model
from keras.preprocessing import image

from keras_loss_function.keras_ssd_loss import SSDLoss
from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
from keras_layers.keras_layer_DecodeDetections import DecodeDetections
from keras_layers.keras_layer_L2Normalization import L2Normalization

img_height = 300
img_width = 300

# Load the trained model. The custom layers and the loss function have to be
# passed via custom_objects because they are not part of core Keras.
model_path = 'ssd300_pascal_07+12_epoch-180_loss-3.5966_val_loss-3.3306.h5'
ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)
K.clear_session()
model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,
                                               'L2Normalization': L2Normalization,
                                               'DecodeDetections': DecodeDetections,
                                               'compute_loss': ssd_loss.compute_loss})

orig_images = []   # The image in its original size, for plotting.
input_images = []  # The resized 300x300 image that is fed to the network.

img_path = 'myDatasets/Testing/JPEGImages/scene00371.png'
orig_images.append(imread(img_path))
img = image.load_img(img_path, target_size=(img_height, img_width))
img = image.img_to_array(img)
input_images.append(img)
input_images = np.array(input_images)

y_pred = model.predict(input_images)

# Keep only boxes whose confidence exceeds the threshold.
confidence_threshold = 0.25
y_pred_thresh = [y_pred[k][y_pred[k,:,1] > confidence_threshold] for k in range(y_pred.shape[0])]

np.set_printoptions(precision=2, suppress=True, linewidth=90, threshold=sys.maxsize)
print("Predicted boxes:\n")
print('class conf xmin ymin xmax ymax')
print(y_pred_thresh[0])

colors = plt.cm.hsv(np.linspace(0, 1, 2)).tolist()
classes = ['background',
           'Plant']

plt.figure(figsize=(20,12))
plt.imshow(orig_images[0])
current_axis = plt.gca()

for box in y_pred_thresh[0]:
    # Transform the box coordinates from the 300x300 network input
    # back to the original image size.
    xmin = box[2] * orig_images[0].shape[1] / img_width
    ymin = box[3] * orig_images[0].shape[0] / img_height
    xmax = box[4] * orig_images[0].shape[1] / img_width
    ymax = box[5] * orig_images[0].shape[0] / img_height
    color = colors[round(box[0])]
    label = '{}: {:.2f}'.format(classes[round(box[0])], box[1])
    current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax-xmin, ymax-ymin, color=color, fill=False, linewidth=2))
    current_axis.text(xmin, ymin, label, size='x-large', color='white', bbox={'facecolor':color, 'alpha':1.0})

plt.show()
```
That's the output from the network:
An example from the image annotation XML file:
I really don't know why this isn't working; I only made parameter-tuning changes to the original code.
I think this is related to the difference between the training and inference modes.

The predictions are correct, but they are scaled between 0 and 1 on both the x and y axes, while the image has kept its original size. As a result, all the predicted boxes end up close to the point (0, 0), i.e. the top-left corner of the image. You should make sure the predictions and the image are on the same scale.

As a workaround, you can decode the predictions from y_pred with decode_detections instead of doing it manually. Make sure to import decode_detections first: `from ssd_encoder_decoder.ssd_output_decoder import decode_detections`.
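For reference, a minimal sketch of that workaround (it assumes the `decode_detections` signature from this repository and reuses the `confidence_threshold`, `img_height`, and `img_width` values from the inference script above; the IoU threshold is a placeholder):

```python
from ssd_encoder_decoder.ssd_output_decoder import decode_detections

# Decode the raw 'training'-mode output. With normalize_coords=True and the
# model's input size given, the normalized coordinates are converted back
# into absolute pixel coordinates in the 300x300 input frame.
y_pred_decoded = decode_detections(y_pred,
                                   confidence_thresh=0.25,
                                   iou_threshold=0.45,
                                   top_k=200,
                                   normalize_coords=True,
                                   img_height=img_height,
                                   img_width=img_width)

print("Predicted boxes:\n")
print('class conf xmin ymin xmax ymax')
print(y_pred_decoded[0])
```

The plotting loop can then be used unchanged on `y_pred_decoded[0]` instead of `y_pred_thresh[0]`, since it already rescales the boxes from the 300x300 input frame to the original image size.

Alternatively, since this is exactly the training-vs-inference difference mentioned above, you could rebuild the model with `mode='inference'` and load your trained weights into it; in that mode the `DecodeDetections` layer performs the decoding inside the graph, so `model.predict()` returns decoded boxes directly. A sketch, reusing the parameter values from your training script:

```python
from keras import backend as K
from models.keras_ssd300 import ssd_300

K.clear_session()
model = ssd_300(image_size=(300, 300, 3),
                n_classes=1,
                mode='inference',  # adds the DecodeDetections layer to the graph
                l2_regularization=0.0005,
                scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05],
                aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5]],
                two_boxes_for_ar1=True,
                steps=[8, 16, 32, 64, 100, 300],
                offsets=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
                clip_boxes=False,
                variances=[0.1, 0.1, 0.2, 0.2],
                normalize_coords=True,
                subtract_mean=[123, 117, 104],
                swap_channels=[2, 1, 0],
                confidence_thresh=0.25,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400)

model.load_weights('ssd300_pascal_07+12_epoch-180_loss-3.5966_val_loss-3.3306.h5', by_name=True)
```

Note that in 'inference' mode the decoded coordinates are still absolute pixel values with respect to the 300x300 model input, so the rescaling to the original image size in your plotting loop still applies.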