ReduceLrOnPlateau implementation issues #8927

Open
riririririi opened this issue Nov 28, 2024 · 1 comment
Comments

@riririririi

Hi, I have been trying to implement a ReduceLROnPlateau scheduler that tracks the loss history and automatically updates the learning rate whenever a loss plateau is detected.
I made some changes to do this. First, I added a struct to darknet.h and initialised it in darknet.c; the struct, defined as "reduce_lr_params", is instantiated under the name "param_list" as below:

//darknet.h
//reduceonplateau struct def start
typedef struct reduce_lr_params{
    float *loss_history;        // rolling window of recent loss values
    int patience;               // iterations without improvement before the LR is reduced
    float lr_factor;            // multiplier applied to the LR on plateau
    int loss_history_size;      // how many entries of loss_history are filled
    float best_loss;            // best loss seen so far
    int cooldown_counter;       // iterations left before another reduction is allowed
    int plateau_count;
    float threshold;            // minimum change that counts as an improvement
    const char *mode;           // "min" or "max"
    int cooldown;               // cooldown length after each reduction
    float min_lr;               // lower bound on the learning rate
    float eps;
    int num_bad_epochs;         // consecutive iterations without improvement
    const char *threshold_mode; // "rel" or "abs"
    float current_loss;         // loss of the latest batch
    int curr_iter;              // new field to store the current iteration number
}reduce_lr_params;

extern reduce_lr_params param_list;
//reduceonplateau struct def end

//added policy
typedef enum {
    CONSTANT, STEP, EXP, POLY, STEPS, SIG, RANDOM, SGDR, REDUCEONPLATEAU
} learning_rate_policy;

//darknet.c
reduce_lr_params param_list;
//parser.c
void parse_net_options(list *options, network *net)
{
//reduceonplateau params start
    param_list.patience = option_find_int(options, "patience", 500);
    param_list.cooldown = option_find_int(options, "cooldown", 500);
    param_list.lr_factor = option_find_float(options, "lr_factor", 0.11);
    param_list.threshold = option_find_float(options, "threshold", 1e-3);
    const char *mode = option_find_str(options, "mode", "min");
    param_list.mode = strcmp(mode, "max") == 0 ? "max" : "min"; // default to "min" if not "max"
    param_list.min_lr = option_find_float(options, "min_lr", 0);
    param_list.eps = option_find_float(options, "eps", 1e-8);

    if (strcmp(option_find_str(options, "threshold_mode", "rel"), "abs") == 0) {
        param_list.threshold_mode = "abs";
    } else {
        param_list.threshold_mode = "rel";
    }

    param_list.current_loss = FLT_MAX;
    param_list.best_loss = FLT_MAX;
    param_list.num_bad_epochs = 0;
    param_list.cooldown_counter = 0;
    param_list.loss_history = (float *)calloc(param_list.patience, sizeof(float));
    param_list.loss_history_size = 0;
//reduceonplateau params end
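
One piece not shown above: for REDUCEONPLATEAU to ever be selected, parser.c's get_policy() also has to map the cfg string to the new enum value. A minimal sketch, assuming it follows the same strcmp pattern as the existing policies:

//parser.c, inside get_policy(char *s) — sketch only
if (strcmp(s, "reduceonplateau") == 0) return REDUCEONPLATEAU;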
//detector.c
        if (ngpus == 1) {
            int wait_key = (dont_show) ? 0 : 1;
            loss = train_network_waitkey(net, train, wait_key);
            param_list.current_loss = loss;                         //reduceonplateau
            param_list.curr_iter = get_current_batch(net);          //reduceonplateau
        }
//network.c
//CUSTOM DEFINITION FOR POLICY = //reduceonplateau - START//
        case REDUCEONPLATEAU:
        {
            if (param_list.curr_iter == batch_num) { return net.learning_rate; }
            // Debug: print initial state
            printf("\n[ITERATION=%d] Current Loss: %f, Best Loss: %f, Bad Epochs: %d, Cooldown Counter: %d, Learning Rate: %f\n",
                batch_num, param_list.current_loss, param_list.best_loss, param_list.num_bad_epochs, param_list.cooldown_counter, net.learning_rate);
            fflush(stdout);

            // Shift loss history and add current loss
            if (param_list.loss_history_size < param_list.patience) {
                param_list.loss_history[param_list.loss_history_size++] = param_list.current_loss;
                // printf("[Loss History] Adding current loss: %f (Size: %d)\n", param_list.current_loss, param_list.loss_history_size);
            } else {
                for (i = 1; i < param_list.patience; ++i) {
                    param_list.loss_history[i - 1] = param_list.loss_history[i];
                }
                param_list.loss_history[param_list.patience - 1] = param_list.current_loss;
                // printf("[Loss History] Shifting and adding current loss: %f\n", param_list.current_loss);
            }

            // Determine if the current loss is better than the best loss
            int is_better = 0;
            if (strcmp(param_list.mode, "min") == 0) {
                if (strcmp(param_list.threshold_mode, "rel") == 0) { // compare with strcmp; == only tests pointer identity
                    is_better = param_list.current_loss < param_list.best_loss * (1 - param_list.threshold);
                    // printf("[Rel Mode] Current Loss: %f, Best Loss: %f, Threshold: %f, Is Better: %d\n", param_list.current_loss, param_list.best_loss, param_list.threshold, is_better);
                } else {
                    is_better = param_list.current_loss < param_list.best_loss - param_list.threshold;
                    // printf("[Abs Mode] Current Loss: %f, Best Loss: %f, Threshold: %f, Is Better: %d\n", param_list.current_loss, param_list.best_loss, param_list.threshold, is_better);
                }
            }

            // Update best loss and reset bad epoch count if current loss is better
            if (is_better) {
                param_list.best_loss = param_list.current_loss;
                param_list.num_bad_epochs = 0;
                // printf("[Loss Update] Best Loss Updated: %f, Resetting Bad Epochs\n", param_list.best_loss);
            } else {
                param_list.num_bad_epochs++;
                // printf("[Loss Update] No improvement. Bad Epochs: %d\n", param_list.num_bad_epochs);
            }

            // Print current loss, best loss, number of bad epochs, and improvement status
            // printf("[Status] Current Loss: %f, Best Loss: %f, Bad Epochs: %d, Improvement: %d\n", param_list.current_loss, param_list.best_loss, param_list.num_bad_epochs, is_better);

            // Print the number of steps remaining in patience
            // printf("[Patience] Patience steps remaining: %d\n", param_list.patience - param_list.num_bad_epochs);

            // Adjust the learning rate if plateau detected and not in cooldown
            if (param_list.num_bad_epochs > param_list.patience && param_list.cooldown_counter <= 0) {
                // printf("[Debug] Adjusting Learning Rate: Bad Epochs: %d, Patience: %d, Cooldown Counter: %d\n", param_list.num_bad_epochs, param_list.patience, param_list.cooldown_counter);
                
                // printf("[Debug] Before Learning Rate Adjustment: %f\n", net.learning_rate);
                rate = net.learning_rate;
                rate *= param_list.lr_factor;
                printf("[Debug] Learning Rate Adjustment Triggered Automatically: %f\n", rate);
                fflush(stdout);
                
                if (rate < param_list.min_lr) {
                    rate = param_list.min_lr;
                    printf("[Debug] Learning Rate Set to Min LR: %f\n", rate);
                    fflush(stdout);
                }
                param_list.cooldown_counter = param_list.cooldown;
                param_list.num_bad_epochs = 0;

                // Print the learning rate when adjusted
                // printf("[Learning Rate] Adjusted Learning Rate: %f (Factor: %f, Min LR: %f)\n", rate, param_list.lr_factor, param_list.min_lr);
                // fflush(stdout);
            } else if (param_list.cooldown_counter > 0) {
                param_list.cooldown_counter--;
                // printf("[Cooldown] Cooldown in progress. Cooldown Counter: %d\n", param_list.cooldown_counter);
            }

            // Print the current cooldown counter
            // printf("[Cooldown] Current Cooldown Counter: %d\n\n", param_list.cooldown_counter);
            // printf("[DEBUG] Rate: %f", rate);
            return rate;
        }
        default:
            fprintf(stderr, "Policy is weird!\n");
            return net.learning_rate;
        }
}
//CUSTOM DEFINITION FOR POLICY = //reduceonplateau - END//

The code first checks whether the current iteration (param_list.curr_iter) matches batch_num. This is done because the function is called from multiple places within the same batch/iteration to print the learning rate.
If param_list.curr_iter is the same as batch_num, we return the old learning rate, as no adjustment is needed there.
If the first condition is not met (i.e., param_list.curr_iter != batch_num), it proceeds to print debug information and continues updating the learning rate as per the defined policy.
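
(For reference, this switch lives inside network.c's get_current_rate(); in this fork it takes the network by value and hands back a fresh rate on every call, roughly this shape:)

//network.c — rough sketch of the enclosing function, for context
float get_current_rate(network net)    // net is passed by value, so nothing
{                                      // assigned to it persists across calls
    switch (net.policy) {
        // ... each case derives a rate from net.learning_rate and
        //     returns it; the result is never written back ...
        default:
            return net.learning_rate;
    }
}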
Now when I run the code, my logs are printed as below:

[ITERATION=5] Current Loss: 4.832144, Best Loss: 2.011736, Bad Epochs: 2, Cooldown Counter: 0, Learning Rate: 0.001200
5/200000: loss=3.3 hours left=198.6  5: 3.334833, 2.580414 avg loss, 0.001200 rate, 2.471886 seconds, 160 images, 198.563651 hours left
Loaded: 0.000051 seconds

[ITERATION=6] Current Loss: 3.334833, Best Loss: 2.011736, Bad Epochs: 3, Cooldown Counter: 0, Learning Rate: 0.001200
[Debug] Learning Rate Adjustment Triggered Automatically: 0.000120
6/200000: loss=2.3 hours left=198.0  6: 2.335879, 2.555961 avg loss, 0.001200 rate, 2.521076 seconds, 192 images, 197.951265 hours left
Loaded: 0.000051 seconds

[ITERATION=7] Current Loss: 2.335879, Best Loss: 2.011736, Bad Epochs: 0, Cooldown Counter: 3, Learning Rate: 0.001200
7/200000: loss=2.9 hours left=197.4  7: 2.855765, 2.585941 avg loss, 0.001200 rate, 2.508097 seconds, 224 images, 197.372339 hours left
Loaded: 0.000046 seconds

[ITERATION=8] Current Loss: 2.855765, Best Loss: 2.011736, Bad Epochs: 1, Cooldown Counter: 2, Learning Rate: 0.001200
8/200000: loss=8.0 hours left=196.8  8: 8.023098, 3.129657 avg loss, 0.001200 rate, 2.524368 seconds, 256 images, 196.791985 hours left
Loaded: 0.000037 seconds

[ITERATION=9] Current Loss: 8.023098, Best Loss: 2.011736, Bad Epochs: 2, Cooldown Counter: 1, Learning Rate: 0.001200
9/200000: loss=6.2 hours left=196.2  9: 6.158772, 3.432569 avg loss, 0.001200 rate, 2.354538 seconds, 288 images, 196.226465 hours left
Loaded: 0.000093 seconds

[ITERATION=10] Current Loss: 6.158772, Best Loss: 2.011736, Bad Epochs: 3, Cooldown Counter: 0, Learning Rate: 0.001200
[Debug] Learning Rate Adjustment Triggered Automatically: 0.000120
10/200000: loss=7.4 hours left=195.6  10: 7.444093, 3.833721 avg loss, 0.001200 rate, 2.464586 seconds, 320 images, 195.572243 hours left
Loaded: 0.000042 seconds

[ITERATION=11] Current Loss: 7.444093, Best Loss: 2.011736, Bad Epochs: 0, Cooldown Counter: 3, Learning Rate: 0.001200
11/200000: loss=9.6 hours left=195.0  11: 9.565534, 4.406902 avg loss, 0.001200 rate, 2.513562 seconds, 352 images, 194.985720 hours left
Loaded: 0.000059 seconds

As seen in the logs above, a plateau is detected at iteration 6, and the debug statement prints the value that the function returns, which is the reduced learning rate. But the very next print statement (from detector.c, by default) shows the learning rate as the old value.
Somewhere in the code the learning rate value is getting overwritten, and I am not sure where, as this is my first time working with C and with programming in general.
This could be a feature request, but as I had already tried implementing it and failed, I am putting it here.
Please help this noob.

@riririririi (Author) commented Nov 28, 2024

nvm, I found the issue and corrected my code.
In the old code the learning rate was getting updated once, but since net.learning_rate itself remains unchanged, the value is reset the next time the function is called.
To overcome this, I added a variable to param_list, int update_count (initialised to 0), to track the number of times a plateau was detected, and changed the logic in network.c accordingly:

"""
//CUSTOM DEFINITION FOR POLICY = //reduceonplateau - START//
        case REDUCEONPLATEAU:
        {
            // 1. Check if current iteration is the same as batch number and update_count is 0
            if (param_list.curr_iter == batch_num && param_list.update_count == 0) {
                // printf("no");
                return net.learning_rate;  // No change to learning rate if no updates have been made
            }

            // 2. If current iteration is the same as batch number and update_count is not 0
            if (param_list.curr_iter == batch_num && param_list.update_count != 0) {
                rate = net.learning_rate;
                // Multiply learning rate by lr_factor for the number of times in update_count
                for (int r = 0; r < param_list.update_count; ++r) {
                    rate *= param_list.lr_factor;
                    // printf("Learning Rate updated: %d times, Current Rate: %f\n", param_list.update_count, rate);
                }
                return rate;  // Return the updated learning rate
            }

            // 3. If current iteration is one greater than batch number, start pushing loss to param_list and check for plateau
            // if (param_list.curr_iter == batch_num + 1) {
                // Shift loss history and add current loss
                if (param_list.loss_history_size < param_list.patience) {
                    param_list.loss_history[param_list.loss_history_size++] = param_list.current_loss;
                    // printf("working");
                } else {
                    for (i = 1; i < param_list.patience; ++i) {
                        param_list.loss_history[i - 1] = param_list.loss_history[i];
                    }
                    param_list.loss_history[param_list.patience - 1] = param_list.current_loss;
                }

                // Check for plateau based on loss improvement
                int is_better = 0;
                if (strcmp(param_list.mode, "min") == 0) {
                    if (strcmp(param_list.threshold_mode, "rel") == 0) { // strcmp, not pointer ==
                        is_better = param_list.current_loss < param_list.best_loss * (1 - param_list.threshold);
                    } else {
                        is_better = param_list.current_loss < param_list.best_loss - param_list.threshold;
                    }
                }

                // Update best loss and reset bad epoch count if current loss is better
                if (is_better) {
                    param_list.best_loss = param_list.current_loss;
                    param_list.num_bad_epochs = 0;
                } else {
                    param_list.num_bad_epochs++;
                }

                // If a plateau is detected, increment update_count and adjust the learning rate
                if (param_list.num_bad_epochs > param_list.patience) {
                    param_list.update_count += 1;
                    printf("updated lr %d times", param_list.update_count);
                    fflush(stdout);
                    rate = net.learning_rate;

                    // Multiply learning rate by lr_factor for the number of times in update_count
                    for (int r = 0; r < param_list.update_count; ++r) {
                        rate *= param_list.lr_factor;
                        // printf("Plateau detected! Learning Rate updated: %d times, Current Rate: %f\n", param_list.update_count, rate);
                    }

                    // Ensure the learning rate doesn't go below the minimum learning rate
                    if (rate < param_list.min_lr) {
                        rate = param_list.min_lr;
                        // printf("Learning Rate clamped to min LR: %f\n", rate);
                    }

                    // Reset cooldown counter and bad epoch count
                    param_list.cooldown_counter = param_list.cooldown;
                    param_list.num_bad_epochs = 0;

                    return rate;  // Return the adjusted learning rate
                }
            // }

            // If cooldown counter is active, decrement it
            if (param_list.cooldown_counter > 0) {
                param_list.cooldown_counter--;
                // printf("Cooldown in progress. Cooldown Counter: %d\n", param_list.cooldown_counter);
            }

            // No new plateau on this call: return the rate implied by the
            // reductions made so far (without an explicit return here,
            // control would fall through into the default case below)
            rate = net.learning_rate;
            for (int r = 0; r < param_list.update_count; ++r) {
                rate *= param_list.lr_factor;
            }
            return rate;
        }
        default:
            fprintf(stderr, "Policy is weird!\n");
            return net.learning_rate;
        }
}
//CUSTOM DEFINITION FOR POLICY = //reduceonplateau - END//
"""

It works now, I am dumb, sorry.
Will create a merge request for the same when time permits.
