Docs › Optimizers
Optimizer API Reference
Optimization algorithms for training neural networks with GPU acceleration.
Base Optimizer Class
class CYXWIZ_API Optimizer {
public:
Optimizer(double learning_rate = 0.001);
virtual ~Optimizer() = default;
// Main optimization step
virtual void Step(std::vector<Tensor*>& parameters,
std::vector<Tensor*>& gradients) = 0;
// Zero all tracked gradients
virtual void ZeroGrad();
// Learning rate management
double LearningRate() const;
void SetLearningRate(double lr);
// State management (for checkpointing)
virtual std::map<std::string, Tensor> State() const;
virtual void LoadState(const std::map<std::string, Tensor>& state);
// Step counter
int64_t Steps() const;
void ResetSteps();
};
SGD (Stochastic Gradient Descent)
SGD(double learning_rate = 0.01,
double momentum = 0.0,
double weight_decay = 0.0,
bool nesterov = false);
Algorithm
v_t = momentum * v_{t-1} + grad
if nesterov:
param = param - lr * (grad + momentum * v_t)
else:
param = param - lr * v_t
if weight_decay > 0:
param = param - lr * weight_decay * param
Usage
// Basic SGD
SGD sgd(0.01);

// SGD with momentum
SGD sgd_momentum(0.01, 0.9);

// SGD with momentum and weight decay
SGD sgd_full(0.01, 0.9, 1e-4, true);  // Nesterov momentum

// Training step
auto params = model.Parameters();
auto grads = model.Gradients();
sgd.Step(params, grads);
Adam
Adam(double learning_rate = 0.001,
double beta1 = 0.9,
double beta2 = 0.999,
double epsilon = 1e-8,
double weight_decay = 0.0,
bool amsgrad = false);
Algorithm
m_t = beta1 * m_{t-1} + (1 - beta1) * grad
v_t = beta2 * v_{t-1} + (1 - beta2) * grad^2
# Bias correction
m_hat = m_t / (1 - beta1^t)
v_hat = v_t / (1 - beta2^t)
if amsgrad:
v_max = max(v_max, v_hat)
param = param - lr * m_hat / (sqrt(v_max) + epsilon)
else:
param = param - lr * m_hat / (sqrt(v_hat) + epsilon)
Usage
// Default Adam
Adam adam(0.001);

// Adam with custom betas
Adam adam_custom(0.001, 0.9, 0.98, 1e-9);

// AdamW (Adam with decoupled weight decay)
Adam adamw(0.001, 0.9, 0.999, 1e-8, 0.01);

// AMSGrad variant
Adam amsgrad(0.001, 0.9, 0.999, 1e-8, 0.0, true);
RMSprop
RMSprop(double learning_rate = 0.01,
double alpha = 0.99,
double epsilon = 1e-8,
double weight_decay = 0.0,
double momentum = 0.0,
bool centered = false);
Usage
// Basic RMSprop
RMSprop rmsprop(0.01);

// RMSprop with momentum
RMSprop rmsprop_mom(0.01, 0.99, 1e-8, 0.0, 0.9);

// Centered RMSprop
RMSprop rmsprop_centered(0.01, 0.99, 1e-8, 0.0, 0.0, true);
Learning Rate Schedulers
StepLR
StepLR(Optimizer* optimizer,
int step_size,
double gamma = 0.1);
// Decay by gamma every step_size epochs
Adam adam(0.001);
StepLR scheduler(&adam, 10, 0.1);
CosineAnnealingLR
CosineAnnealingLR(
Optimizer* optimizer,
int T_max,
double eta_min = 0.0);
// lr = eta_min + 0.5 * (base_lr - eta_min)
// * (1 + cos(pi * epoch / T_max))
WarmupScheduler
WarmupScheduler(
Optimizer* optimizer,
int warmup_steps,
LRScheduler* after_warmup = nullptr);
// Linear warmup from 0 to base_lr
ReduceLROnPlateau
ReduceLROnPlateau(
Optimizer* optimizer,
std::string mode = "min",
double factor = 0.1,
int patience = 10);
// Reduce LR when metric stops improving
Python Bindings
import pycyxwiz as cyx
# SGD
sgd = cyx.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# Adam
adam = cyx.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)
# AdamW
adamw = cyx.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
# Training loop
for epoch in range(epochs):
for batch in dataloader:
optimizer.zero_grad()
loss = model(batch)
loss.backward()
optimizer.step()
# Schedulers
scheduler = cyx.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
for epoch in range(epochs):
train()
    scheduler.step()
Optimizer Selection Guide
| Optimizer | Best For | Key Parameters |
|---|---|---|
| SGD+Momentum | Computer vision, CNNs | lr=0.1, momentum=0.9 |
| Adam | General purpose, NLP | lr=0.001 |
| AdamW | Transformers, large models | lr=0.0001, wd=0.01 |
| RMSprop | RNNs, non-stationary | lr=0.001 |
Best Practices
- Start with defaults: Adam 0.001, SGD 0.01-0.1
- Use warmup: Especially for large batch sizes
- Decay during training: Cosine or step decay
- Monitor loss: Reduce on plateau for fine-tuning
- Weight decay: Typical values 1e-4 to 1e-2