Fix early stopping in converter patching + fix lr warmup for all tasks (#4131)

* fix converter and early stopping + fix warmup epochs

* fix linter

* fix linter2

* aligned default patience=10 for all tasks

* fix linter

* fix unit tests

* revert epoch to steps back, change templates

* fix cls templates

* fix unit test

* revert rotated det back.

* change schedule for classification

* fix linter

* update changelog
kprokofi authored Dec 4, 2024
1 parent c6e2952 commit 5707bc5
Showing 85 changed files with 326 additions and 376 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -73,6 +73,8 @@ All notable changes to this project will be documented in this file.
(<https://github.com/openvinotoolkit/training_extensions/pull/4035>)
- Bump onnx to 1.17.0 to omit CVE-2024-5187
(<https://github.com/openvinotoolkit/training_extensions/pull/4063>)
+- Decouple DinoV2 for semantic segmentation task
+  (<https://github.com/openvinotoolkit/training_extensions/pull/4136>)

### Bug fixes

@@ -126,6 +128,8 @@ All notable changes to this project will be documented in this file.
(<https://github.com/openvinotoolkit/training_extensions/pull/4107>)
- Fix empty annotation in tiling
(<https://github.com/openvinotoolkit/training_extensions/pull/4124>)
+- Fix patching early stopping in tools/converter.py, update headers in templates, change training schedule for classification
+  (<https://github.com/openvinotoolkit/training_extensions/pull/4131>)
- Fix tensor type compatibility in dynamic soft label assigner and RTMDet head
(<https://github.com/openvinotoolkit/training_extensions/pull/4140>)

2 changes: 1 addition & 1 deletion src/otx/algo/callbacks/adaptive_early_stopping.py
@@ -20,7 +20,7 @@ def __init__(
self,
monitor: str,
min_delta: float = 0.0,
-        patience: int = 3,
+        patience: int = 10,
verbose: bool = False,
mode: str = "min",
strict: bool = True,
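The default patience moves from 3 to 10, aligning all tasks on one value. A minimal usage sketch, assuming `EarlyStoppingWithWarmup` keeps the `lightning.pytorch.callbacks.EarlyStopping` interface that the `__init__` above mirrors; the metric name is taken from the classification recipes later in this diff:

```python
from lightning.pytorch import Trainer
from otx.algo.callbacks.adaptive_early_stopping import EarlyStoppingWithWarmup

# Sketch: stop when val/accuracy has not improved for 10 validation rounds
# (the new default patience introduced by this commit).
early_stopping = EarlyStoppingWithWarmup(monitor="val/accuracy", mode="max")
trainer = Trainer(callbacks=[early_stopping], max_epochs=90)
```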
2 changes: 1 addition & 1 deletion src/otx/core/model/base.py
@@ -744,7 +744,7 @@ def lr_scheduler_step(self, scheduler: LRSchedulerTypeUnion, metric: Tensor) ->
return super().lr_scheduler_step(scheduler=scheduler, metric=metric)

if len(warmup_schedulers) != 1:
msg = "No more than two warmup schedulers coexist."
msg = "No more than one warmup schedulers coexist."
raise RuntimeError(msg)

warmup_scheduler = next(iter(warmup_schedulers))
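Only the error message changes here; the guard itself still requires exactly one warmup scheduler at this point in `lr_scheduler_step`. A condensed sketch of the invariant, assuming `warmup_schedulers` is gathered by filtering the configured schedulers for `LinearWarmupScheduler` instances (that collection happens outside this hunk):

```python
from otx.core.schedulers.warmup_schedulers import LinearWarmupScheduler

def assert_single_warmup(schedulers: list) -> LinearWarmupScheduler:
    """Sketch of the invariant enforced by the hunk above."""
    warmup_schedulers = [s for s in schedulers if isinstance(s, LinearWarmupScheduler)]
    if len(warmup_schedulers) != 1:
        msg = "No more than one warmup schedulers coexist."
        raise RuntimeError(msg)
    return next(iter(warmup_schedulers))
```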
7 changes: 4 additions & 3 deletions src/otx/core/schedulers/warmup_schedulers.py
@@ -19,8 +19,9 @@ class LinearWarmupScheduler(LambdaLR):
"""Linear Warmup scheduler.
Args:
-        num_warmup_steps: Learning rate will linearly increased during the period same as this number.
-        warmup_interval: If "epoch", count the number of steps for the warmup period.
+        optimizer (Optimizer): Optimizer to apply the scheduler.
+        num_warmup_steps (int): Learning rate will linearly increased during the period same as this number.
+        interval (Literal["step", "epoch"]): If "epoch", count the number of epochs for the warmup period.
Otherwise, the iteration step will be the warmup period.
"""

@@ -55,7 +56,7 @@ class LinearWarmupSchedulerCallable:
main_scheduler_callable: Callable to create a LR scheduler that will be mainly used.
num_warmup_steps: Learning rate will linearly increased during the period same as this number.
If it is less than equal to zero, do not create `LinearWarmupScheduler`.
warmup_interval: If "epoch", count the number of steps for the warmup period.
warmup_interval: If "epoch", count the number of epochs for the warmup period.
Otherwise, the iteration step will be the warmup period.
monitor: If given, override the main scheduler's `monitor` attribute.
"""
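For background, linear warmup on top of `LambdaLR` (the base class named above) boils down to a multiplicative factor that ramps to 1.0 over the warmup period. A generic sketch, not the repo's exact implementation; the `interval` handling ("step" vs "epoch") is what `LinearWarmupScheduler` adds on top of this idea:

```python
from torch.nn import Linear
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR

num_warmup_steps = 100  # illustrative value

def warmup_factor(step: int) -> float:
    # Ramp linearly toward 1.0, then hold: effective lr = base_lr * factor.
    return min(1.0, float(step + 1) / num_warmup_steps)

optimizer = SGD(Linear(4, 2).parameters(), lr=0.1)
scheduler = LambdaLR(optimizer, lr_lambda=warmup_factor)
```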
2 changes: 2 additions & 0 deletions src/otx/recipe/_base_/train.yaml
@@ -39,6 +39,8 @@ callbacks:
init_args:
max_interval: 5
decay: -0.025
+      min_earlystop_patience: 5
+      min_lrschedule_patience: 3
logger:
- class_path: lightning.pytorch.loggers.csv_logs.CSVLogger
init_args:
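The two new keys are passed to the adaptive scheduling callback configured above. Judging by their names alone (their semantics are not shown in this diff, so this is an assumption), they act as floors so that adaptively scaled validation intervals cannot push early-stopping patience below 5 or LR-plateau patience below 3:

```python
# Hypothetical illustration of such floors; the name clamp and the logic are
# assumptions inferred from the YAML keys, not code from the repository.
def clamp(adapted_patience: int, floor: int) -> int:
    return max(adapted_patience, floor)

print(clamp(2, 5))  # -> 5: patience never drops below the configured floor
print(clamp(7, 3))  # -> 7: values above the floor pass through unchanged
```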
2 changes: 1 addition & 1 deletion src/otx/recipe/anomaly_classification/stfpm.yaml
@@ -16,7 +16,7 @@ overrides:
precision: 32
max_epochs: 100
callbacks:
-    - class_path: lightning.pytorch.callbacks.EarlyStopping
+    - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
init_args:
patience: 5
mode: max
2 changes: 1 addition & 1 deletion src/otx/recipe/anomaly_detection/stfpm.yaml
@@ -21,7 +21,7 @@ overrides:
precision: 32
max_epochs: 100
callbacks:
-    - class_path: lightning.pytorch.callbacks.EarlyStopping
+    - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
init_args:
patience: 5
mode: max
2 changes: 1 addition & 1 deletion src/otx/recipe/anomaly_segmentation/stfpm.yaml
@@ -16,7 +16,7 @@ overrides:
precision: 32
max_epochs: 100
callbacks:
-    - class_path: lightning.pytorch.callbacks.EarlyStopping
+    - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
init_args:
patience: 5
mode: max
23 changes: 14 additions & 9 deletions src/otx/recipe/classification/h_label_cls/deit_tiny.yaml
@@ -10,12 +10,16 @@ model:
weight_decay: 0.05

scheduler:
-    class_path: lightning.pytorch.cli.ReduceLROnPlateau
+    class_path: otx.core.schedulers.LinearWarmupSchedulerCallable
init_args:
-      mode: max
-      factor: 0.5
-      patience: 1
-      monitor: val/accuracy
+      num_warmup_steps: 0
+      main_scheduler_callable:
+        class_path: lightning.pytorch.cli.ReduceLROnPlateau
+        init_args:
+          mode: max
+          factor: 0.5
+          patience: 3
+          monitor: val/accuracy

engine:
task: H_LABEL_CLS
@@ -26,11 +30,12 @@ callback_monitor: val/accuracy
data: ../../_base_/data/classification.yaml
overrides:
max_epochs: 90
-  callbacks:
-    - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
-      init_args:
-        patience: 3

data:
task: H_LABEL_CLS
data_format: datumaro

+  callbacks:
+    - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
+      init_args:
+        patience: 5
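The override above wraps the previous `ReduceLROnPlateau` setup in `LinearWarmupSchedulerCallable` with warmup disabled (`num_warmup_steps: 0`), while raising the plateau patience from 1 to 3 and the early-stopping patience from 3 to 5. A rough Python equivalent of what the CLI builds from this YAML, assuming the callable's signature matches its docstring earlier in the diff and that `main_scheduler_callable` is invoked with the optimizer:

```python
from functools import partial

from lightning.pytorch.cli import ReduceLROnPlateau
from otx.core.schedulers import LinearWarmupSchedulerCallable

scheduler_callable = LinearWarmupSchedulerCallable(
    main_scheduler_callable=partial(
        ReduceLROnPlateau,  # Lightning's variant carries the monitor name
        monitor="val/accuracy",
        mode="max",
        factor=0.5,
        patience=3,
    ),
    num_warmup_steps=0,  # <= 0: no LinearWarmupScheduler is created, per the docstring
)
```

The same rewrite is applied to the remaining classification recipes below, with `CosineAnnealingLR` taking the place of `ReduceLROnPlateau` where that was the main scheduler.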
19 changes: 12 additions & 7 deletions src/otx/recipe/classification/h_label_cls/efficientnet_b0.yaml
@@ -11,12 +11,16 @@ model:
weight_decay: 0.0001

scheduler:
-    class_path: lightning.pytorch.cli.ReduceLROnPlateau
+    class_path: otx.core.schedulers.LinearWarmupSchedulerCallable
init_args:
-      mode: max
-      factor: 0.5
-      patience: 1
-      monitor: val/accuracy
+      num_warmup_steps: 0
+      main_scheduler_callable:
+        class_path: lightning.pytorch.cli.ReduceLROnPlateau
+        init_args:
+          mode: max
+          factor: 0.5
+          patience: 3
+          monitor: val/accuracy

engine:
task: H_LABEL_CLS
@@ -29,11 +33,12 @@ overrides:
reset:
- data.train_subset.transforms

-  max_epochs: 90
callbacks:
- class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
init_args:
-        patience: 3
+        patience: 5

+  max_epochs: 90

data:
task: H_LABEL_CLS
17 changes: 16 additions & 1 deletion src/otx/recipe/classification/h_label_cls/efficientnet_v2.yaml
@@ -10,6 +10,18 @@ model:
momentum: 0.9
weight_decay: 0.0001

+  scheduler:
+    class_path: otx.core.schedulers.LinearWarmupSchedulerCallable
+    init_args:
+      num_warmup_steps: 0
+      main_scheduler_callable:
+        class_path: lightning.pytorch.cli.ReduceLROnPlateau
+        init_args:
+          mode: max
+          factor: 0.5
+          patience: 3
+          monitor: val/accuracy

engine:
task: H_LABEL_CLS
device: auto
@@ -25,8 +37,11 @@ overrides:
callbacks:
- class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
init_args:
-        patience: 3
+        warmup_iters: 750
+    - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
+      init_args:
+        patience: 5

data:
task: H_LABEL_CLS
data_format: datumaro
@@ -19,7 +19,7 @@ model:
init_args:
mode: max
factor: 0.5
-      patience: 1
+      patience: 3
monitor: val/accuracy

engine:
@@ -31,10 +31,11 @@ callback_monitor: val/accuracy
data: ../../_base_/data/classification.yaml
overrides:
max_epochs: 90

callbacks:
- class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
init_args:
-        patience: 3
+        patience: 5

data:
task: H_LABEL_CLS
@@ -12,10 +12,14 @@ model:
weight_decay: 0.0001

scheduler:
-    class_path: torch.optim.lr_scheduler.CosineAnnealingLR
+    class_path: otx.core.schedulers.LinearWarmupSchedulerCallable
init_args:
-      T_max: 100000
-      eta_min: 0
+      num_warmup_steps: 0
+      main_scheduler_callable:
+        class_path: torch.optim.lr_scheduler.CosineAnnealingLR
+        init_args:
+          T_max: 100000
+          eta_min: 0

engine:
task: H_LABEL_CLS
@@ -29,10 +33,11 @@ overrides:
- data.train_subset.transforms

max_epochs: 90

callbacks:
- class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
init_args:
-        patience: 3
+        patience: 5

data:
task: H_LABEL_CLS
@@ -12,10 +12,14 @@ model:
weight_decay: 0.0001

scheduler:
-    class_path: torch.optim.lr_scheduler.CosineAnnealingLR
+    class_path: otx.core.schedulers.LinearWarmupSchedulerCallable
init_args:
-      T_max: 100000
-      eta_min: 0
+      num_warmup_steps: 0
+      main_scheduler_callable:
+        class_path: torch.optim.lr_scheduler.CosineAnnealingLR
+        init_args:
+          T_max: 100000
+          eta_min: 0

engine:
task: H_LABEL_CLS
@@ -29,10 +33,11 @@ overrides:
- data.train_subset.transforms

max_epochs: 90

callbacks:
- class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
init_args:
-        patience: 3
+        patience: 5

data:
task: H_LABEL_CLS
@@ -12,10 +12,14 @@ model:
weight_decay: 0.0001

scheduler:
-    class_path: torch.optim.lr_scheduler.CosineAnnealingLR
+    class_path: otx.core.schedulers.LinearWarmupSchedulerCallable
init_args:
-      T_max: 100000
-      eta_min: 0
+      num_warmup_steps: 0
+      main_scheduler_callable:
+        class_path: torch.optim.lr_scheduler.CosineAnnealingLR
+        init_args:
+          T_max: 100000
+          eta_min: 0

engine:
task: H_LABEL_CLS
@@ -29,10 +33,11 @@ overrides:
- data.train_subset.transforms

max_epochs: 90

callbacks:
- class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
init_args:
-        patience: 3
+        patience: 5

data:
task: H_LABEL_CLS
17 changes: 11 additions & 6 deletions src/otx/recipe/classification/multi_class_cls/deit_tiny.yaml
@@ -12,12 +12,16 @@ model:
weight_decay: 0.05

scheduler:
-    class_path: lightning.pytorch.cli.ReduceLROnPlateau
+    class_path: otx.core.schedulers.LinearWarmupSchedulerCallable
init_args:
-      mode: max
-      factor: 0.5
-      patience: 1
-      monitor: val/accuracy
+      num_warmup_steps: 0
+      main_scheduler_callable:
+        class_path: lightning.pytorch.cli.ReduceLROnPlateau
+        init_args:
+          mode: max
+          factor: 0.5
+          patience: 3
+          monitor: val/accuracy

engine:
task: MULTI_CLASS_CLS
@@ -28,7 +32,8 @@ callback_monitor: val/accuracy
data: ../../_base_/data/classification.yaml
overrides:
max_epochs: 90

callbacks:
- class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
init_args:
-        patience: 3
+        patience: 5
17 changes: 11 additions & 6 deletions src/otx/recipe/classification/multi_class_cls/dino_v2.yaml
@@ -11,12 +11,16 @@ model:
weight_decay: 0.05

scheduler:
-    class_path: lightning.pytorch.cli.ReduceLROnPlateau
+    class_path: otx.core.schedulers.LinearWarmupSchedulerCallable
init_args:
-      mode: max
-      factor: 0.5
-      patience: 1
-      monitor: val/accuracy
+      num_warmup_steps: 0
+      main_scheduler_callable:
+        class_path: lightning.pytorch.cli.ReduceLROnPlateau
+        init_args:
+          mode: max
+          factor: 0.5
+          patience: 3
+          monitor: val/accuracy

engine:
task: MULTI_CLASS_CLS
@@ -27,7 +31,8 @@ callback_monitor: val/accuracy
data: ../../_base_/data/classification.yaml
overrides:
max_epochs: 90

callbacks:
- class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup
init_args:
-        patience: 3
+        patience: 5