CyxWiz
Docs › Optimizers

Optimizer API Reference

Optimization algorithms for training neural networks with GPU acceleration.

Base Optimizer Class

class CYXWIZ_API Optimizer {
public:
    Optimizer(double learning_rate = 0.001);
    virtual ~Optimizer() = default;

    // Main optimization step
    virtual void Step(std::vector<Tensor*>& parameters,
                      std::vector<Tensor*>& gradients) = 0;

    // Zero all tracked gradients
    virtual void ZeroGrad();

    // Learning rate management
    double LearningRate() const;
    void SetLearningRate(double lr);

    // State management (for checkpointing)
    virtual std::map<std::string, Tensor> State() const;
    virtual void LoadState(const std::map<std::string, Tensor>& state);

    // Step counter
    int64_t Steps() const;
    void ResetSteps();
};

SGD (Stochastic Gradient Descent)

SGD(double learning_rate = 0.01,
    double momentum = 0.0,
    double weight_decay = 0.0,
    bool nesterov = false);

Algorithm

v_t = momentum * v_{t-1} + grad
if nesterov:
    param = param - lr * (grad + momentum * v_t)
else:
    param = param - lr * v_t

# Note: as shown above, weight decay is applied as a separate,
# decoupled step that shrinks the parameter directly, rather than
# being folded into the gradient before the momentum update.
if weight_decay > 0:
    param = param - lr * weight_decay * param

Usage

// Basic SGD
SGD sgd(0.01);

// SGD with momentum
SGD sgd_momentum(0.01, 0.9);

// SGD with momentum and weight decay
SGD sgd_full(0.01, 0.9, 1e-4, true);  // Nesterov momentum

// Training step
auto params = model.Parameters();
auto grads = model.Gradients();
sgd.Step(params, grads);

Adam

Adam(double learning_rate = 0.001,
     double beta1 = 0.9,
     double beta2 = 0.999,
     double epsilon = 1e-8,
     double weight_decay = 0.0,
     bool amsgrad = false);

Algorithm

m_t = beta1 * m_{t-1} + (1 - beta1) * grad
v_t = beta2 * v_{t-1} + (1 - beta2) * grad^2

# Bias correction
m_hat = m_t / (1 - beta1^t)
v_hat = v_t / (1 - beta2^t)

if amsgrad:
    v_max = max(v_max, v_hat)
    param = param - lr * m_hat / (sqrt(v_max) + epsilon)
else:
    param = param - lr * m_hat / (sqrt(v_hat) + epsilon)

Usage

// Default Adam
Adam adam(0.001);

// Adam with custom betas
Adam adam_custom(0.001, 0.9, 0.98, 1e-9);

// Adam with weight decay (for true decoupled AdamW behavior,
// see the dedicated AdamW class in the Python bindings)
Adam adamw(0.001, 0.9, 0.999, 1e-8, 0.01);

// AMSGrad variant
Adam amsgrad(0.001, 0.9, 0.999, 1e-8, 0.0, true);

RMSprop

RMSprop(double learning_rate = 0.01,
        double alpha = 0.99,
        double epsilon = 1e-8,
        double weight_decay = 0.0,
        double momentum = 0.0,
        bool centered = false);

Usage

// Basic RMSprop
RMSprop rmsprop(0.01);

// RMSprop with momentum
RMSprop rmsprop_mom(0.01, 0.99, 1e-8, 0.0, 0.9);

// Centered RMSprop
RMSprop rmsprop_centered(0.01, 0.99, 1e-8, 0.0, 0.0, true);

Learning Rate Schedulers

StepLR
StepLR(Optimizer* optimizer,
       int step_size,
       double gamma = 0.1);

// Decay by gamma every step_size epochs
Adam adam(0.001);
StepLR scheduler(&adam, 10, 0.1);
CosineAnnealingLR
CosineAnnealingLR(
    Optimizer* optimizer,
    int T_max,
    double eta_min = 0.0);

// lr = eta_min + 0.5 * (base_lr - eta_min)
//      * (1 + cos(pi * epoch / T_max))
WarmupScheduler
WarmupScheduler(
    Optimizer* optimizer,
    int warmup_steps,
    LRScheduler* after_warmup = nullptr);

// Linear warmup from 0 to base_lr
ReduceLROnPlateau
ReduceLROnPlateau(
    Optimizer* optimizer,
    std::string mode = "min",
    double factor = 0.1,
    int patience = 10);

// Reduce LR when metric stops improving

Python Bindings

import pycyxwiz as cyx

# SGD
sgd = cyx.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Adam
adam = cyx.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)

# AdamW
adamw = cyx.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

# Training loop
optimizer = adam  # any optimizer instance from above
for epoch in range(epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        loss = model(batch)
        loss.backward()
        optimizer.step()

# Schedulers
scheduler = cyx.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

for epoch in range(epochs):
    train()
    scheduler.step()

Optimizer Selection Guide

| Optimizer      | Best For                   | Key Parameters       |
|----------------|----------------------------|----------------------|
| SGD + Momentum | Computer vision, CNNs      | lr=0.1, momentum=0.9 |
| Adam           | General purpose, NLP       | lr=0.001             |
| AdamW          | Transformers, large models | lr=0.0001, wd=0.01   |
| RMSprop        | RNNs, non-stationary       | lr=0.001             |

Best Practices

  1. Start with defaults: Adam 0.001, SGD 0.01-0.1
  2. Use warmup: Especially for large batch sizes
  3. Decay during training: Cosine or step decay
  4. Monitor loss: Reduce on plateau for fine-tuning
  5. Weight decay: Typical values 1e-4 to 1e-2