""" Scheduler Factory
|
|
Hacked together by / Copyright 2020 Ross Wightman
|
|
"""
|
|
from torch.optim import Optimizer
|
|
import math
|
|
from torch.optim.lr_scheduler import LambdaLR, _LRScheduler
|
|
import math
|
|
|
|
|
|
# NOTE: the epoch-based schedulers below are left commented out; the active,
# step-based schedulers used by create_scheduler() follow after this block.
#
# class LinearWarmupStepLRScheduler:
#     def __init__(
#         self,
#         optimizer,
#         max_epoch,
#         min_lr,
#         init_lr,
#         decay_rate=1,
#         warmup_start_lr=-1,
#         warmup_steps=0,
#         **kwargs
#     ):
#         self.optimizer = optimizer
#         self.max_epoch = max_epoch
#         self.min_lr = min_lr
#         self.decay_rate = decay_rate
#         self.init_lr = init_lr
#         self.warmup_steps = warmup_steps
#         self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
#
#     def step(self, cur_epoch, cur_step):
#         if cur_epoch == 0:
#             warmup_lr_schedule(
#                 step=cur_step,
#                 optimizer=self.optimizer,
#                 max_step=self.warmup_steps,
#                 init_lr=self.warmup_start_lr,
#                 max_lr=self.init_lr,
#             )
#         else:
#             step_lr_schedule(
#                 epoch=cur_epoch,
#                 optimizer=self.optimizer,
#                 init_lr=self.init_lr,
#                 min_lr=self.min_lr,
#                 decay_rate=self.decay_rate,
#             )
#
#
# class LinearWarmupCosineLRScheduler:
#     def __init__(
#         self,
#         optimizer,
#         max_epoch,
#         min_lr,
#         init_lr,
#         warmup_steps=0,
#         warmup_start_lr=-1,
#         **kwargs
#     ):
#         self.optimizer = optimizer
#         self.max_epoch = max_epoch
#         self.min_lr = min_lr
#         self.init_lr = init_lr
#         self.warmup_steps = warmup_steps
#         self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
#
#     def step(self, cur_epoch, cur_step):
#         # assuming the warmup iters less than one epoch
#         if cur_epoch == 0:
#             warmup_lr_schedule(
#                 step=cur_step,
#                 optimizer=self.optimizer,
#                 max_step=self.warmup_steps,
#                 init_lr=self.warmup_start_lr,
#                 max_lr=self.init_lr,
#             )
#         else:
#             cosine_lr_schedule(
#                 epoch=cur_epoch,
#                 optimizer=self.optimizer,
#                 max_epoch=self.max_epoch,
#                 init_lr=self.init_lr,
#                 min_lr=self.min_lr,
#             )
#
#
# class ConstantLRScheduler:
#     def __init__(self, optimizer, init_lr, warmup_start_lr=-1, warmup_steps=0, **kwargs):
#         self.optimizer = optimizer
#         self.lr = init_lr
#         self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
#         self.warmup_steps = warmup_steps
#
#     def step(self, cur_epoch, cur_step):
#         if cur_epoch == 0:
#             warmup_lr_schedule(
#                 step=cur_step,
#                 optimizer=self.optimizer,
#                 max_step=self.warmup_steps,
#                 init_lr=self.warmup_start_lr,
#                 max_lr=self.lr,
#             )
#         else:
#             for param_group in self.optimizer.param_groups:
#                 param_group["lr"] = self.lr
#
#
# schedulers = {
#     'constant_lr': ConstantLRScheduler,
#     'linear_warmup_cosine_lr': LinearWarmupCosineLRScheduler,
#     'linear_warmup_step_lr': LinearWarmupStepLRScheduler
# }
#
#
# def cosine_lr_schedule(optimizer, epoch, max_epoch, init_lr, min_lr):
#     """Decay the learning rate"""
#     lr = (init_lr - min_lr) * 0.5 * (
#         1.0 + math.cos(math.pi * epoch / max_epoch)
#     ) + min_lr
#     for param_group in optimizer.param_groups:
#         param_group["lr"] = lr
#
#
# def warmup_lr_schedule(optimizer, step, max_step, init_lr, max_lr):
#     """Warmup the learning rate"""
#     lr = min(max_lr, init_lr + (max_lr - init_lr) * step / max(max_step, 1))
#     for param_group in optimizer.param_groups:
#         param_group["lr"] = lr
#
#
# def step_lr_schedule(optimizer, epoch, init_lr, min_lr, decay_rate):
#     """Decay the learning rate"""
#     lr = max(min_lr, init_lr * (decay_rate**epoch))
#     for param_group in optimizer.param_groups:
#         param_group["lr"] = lr
#
#
# def create_scheduler(config, optimizer):
#     scheduler_cls = schedulers[config.get('scheduler', 'constant_lr')]
#     max_epoch = config.epochs
#     min_lr = config.min_lr
#     init_lr = config.lr
#     warmup_start_lr = config.get('warmup_lr', -1)
#     warmup_steps = config.get('warmup_steps', 0)
#
#     scheduler = scheduler_cls(
#         optimizer=optimizer,
#         max_epoch=max_epoch,
#         min_lr=min_lr,
#         init_lr=init_lr,
#         decay_rate=None,
#         warmup_start_lr=warmup_start_lr,
#         warmup_steps=warmup_steps
#     )
#
#     return scheduler


class WarmupLinearScheduleNonZero(_LRScheduler):
    """Linear warmup and then linear decay.

    Linearly increases the learning rate from 0 up to the optimizer's base lr over
    `warmup_steps` training steps, then linearly decays it over the remaining
    `t_total - warmup_steps` steps. The lr is never allowed to drop below `min_lr`.
    """
    def __init__(self, optimizer, warmup_steps, t_total, min_lr=1e-5, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        self.min_lr = min_lr
        super(WarmupLinearScheduleNonZero, self).__init__(optimizer, last_epoch=last_epoch)

    def get_lr(self):
        step = self.last_epoch
        if step < self.warmup_steps:
            lr_factor = float(step) / float(max(1, self.warmup_steps))
        else:
            lr_factor = max(0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))

        return [
            base_lr * lr_factor if (base_lr * lr_factor) > self.min_lr else self.min_lr
            for base_lr in self.base_lrs
        ]
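

# Worked example for WarmupLinearScheduleNonZero (illustrative numbers only, not
# taken from any config): with warmup_steps=1000, t_total=10000, a base lr of 5e-5
# and the default min_lr=1e-5:
#   step  500 -> lr_factor = 500/1000  = 0.5    -> lr = 2.5e-5  (warmup ramp)
#   step 5500 -> lr_factor = 4500/9000 = 0.5    -> lr = 2.5e-5  (linear decay)
#   step 9900 -> lr_factor = 100/9000 ~= 0.011  -> 5.6e-7 is below min_lr, so the
#                lr is clamped to 1e-5 (hence the "NonZero" in the name).

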
def create_scheduler(config, optimizer):
    lr_scheduler = None
    if config['scheduler'] == 'cosine':
        lr_scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=config['num_warmup_steps'],
            num_training_steps=config['num_training_steps'],
            num_cycles=0.5,
            min_lr_multi=config['min_lr_multi']
        )
    elif config['scheduler'] == 'linear':
        lr_scheduler = WarmupLinearScheduleNonZero(
            optimizer,
            config['num_warmup_steps'],
            config['num_training_steps'],
            min_lr=config['min_lr']
        )
    return lr_scheduler
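

# Illustrative usage of create_scheduler (a sketch only; the key values are made up,
# and `config` can be any mapping that supports item access with these keys):
#
#     config = {"scheduler": "linear", "num_warmup_steps": 1000,
#               "num_training_steps": 20000, "min_lr": 1e-5}
#     lr_scheduler = create_scheduler(config, optimizer)
#     for batch in loader:
#         ...                     # forward / backward
#         optimizer.step()
#         lr_scheduler.step()     # step-based: call once per optimizer update
#
# Any other value of config["scheduler"] silently returns None.

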
def get_cosine_schedule_with_warmup(
    optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int,
    num_cycles: float = 0.5, min_lr_multi: float = 0., last_epoch: int = -1
):
    """
    Modified from https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/optimization.py

    Create a schedule with a learning rate that decreases following the values of the cosine function between
    the initial lr set in the optimizer and 0 (or `base_lr * min_lr_multi`, if a floor is given), after a warmup
    period during which it increases linearly between 0 and the initial lr set in the optimizer.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        num_cycles (`float`, *optional*, defaults to 0.5):
            The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
            following a half-cosine).
        min_lr_multi (`float`, *optional*, defaults to 0):
            The minimum learning rate multiplier. Thus the minimum learning rate is base_lr * min_lr_multi.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.
    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return max(min_lr_multi, float(current_step) / float(max(1, num_warmup_steps)))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(min_lr_multi, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

    return LambdaLR(optimizer, lr_lambda, last_epoch)
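

if __name__ == "__main__":
    # Minimal smoke test, left here as an illustrative sketch rather than as part of
    # the factory API: it builds a throwaway SGD optimizer around a single dummy
    # parameter and prints the learning rates each schedule produces. All of the
    # step counts and learning rates below are arbitrary example values.
    import torch

    for name, cfg in [
        ("cosine", {"scheduler": "cosine", "num_warmup_steps": 2,
                    "num_training_steps": 10, "min_lr_multi": 0.01}),
        ("linear", {"scheduler": "linear", "num_warmup_steps": 2,
                    "num_training_steps": 10, "min_lr": 1e-6}),
    ]:
        param = torch.nn.Parameter(torch.zeros(1))
        optim = torch.optim.SGD([param], lr=1e-4)
        sched = create_scheduler(cfg, optim)
        lrs = []
        for _ in range(10):
            optim.step()   # no-op here (no gradients), but keeps the step order valid
            sched.step()
            lrs.append(optim.param_groups[0]["lr"])
        print(name, ["%.1e" % lr for lr in lrs])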