CyxWiz Documentation — Device

Device API Reference

Device management and GPU acceleration, providing abstraction over CUDA, OpenCL, and CPU backends via ArrayFire.

Device Types

enum class DeviceType {
    CPU,        // CPU backend
    CUDA,       // NVIDIA CUDA
    OpenCL,     // OpenCL (AMD, Intel, etc.)
    Metal,      // Apple Metal (macOS/iOS)
    Auto        // Automatic selection (best available)
};

enum class Backend {
    CPU,
    CUDA,
    OpenCL,
    Metal
};

Device Class

class CYXWIZ_API Device {
public:
    // Get device information
    static DeviceInfo GetInfo(int device_id = -1);
    static std::vector<DeviceInfo> GetAllDevices();

    // Device selection
    static void SetDevice(int device_id);
    static void SetDevice(DeviceType type, int device_index = 0);
    static int GetDevice();
    static DeviceType GetDeviceType();

    // Backend management
    static void SetBackend(Backend backend);
    static Backend GetBackend();
    static std::vector<Backend> GetAvailableBackends();
    static bool IsBackendAvailable(Backend backend);

    // Memory management
    static size_t GetMemoryUsed();
    static size_t GetMemoryTotal();
    static size_t GetMemoryAvailable();
    static void FreeMemory();
    static void GarbageCollect();

    // Synchronization
    static void Sync();
    static void SyncAll();

    // Multi-GPU
    static int GetDeviceCount();
    static int GetDeviceCount(DeviceType type);
    static void EnablePeerAccess(int device_from, int device_to);
    static bool CanAccessPeer(int device_from, int device_to);
};

DeviceInfo Structure

struct DeviceInfo {
    int device_id;
    std::string name;
    DeviceType type;
    Backend backend;

    // Memory
    size_t total_memory;
    size_t available_memory;
    size_t used_memory;

    // Compute capabilities
    int compute_units;           // CUDA cores / OpenCL compute units
    int max_work_group_size;
    int max_threads_per_block;
    int warp_size;               // 32 for NVIDIA, 64 for AMD

    // Version info
    std::string driver_version;
    int compute_capability_major;  // CUDA only
    int compute_capability_minor;  // CUDA only

    // Features
    bool supports_double;
    bool supports_half;
    bool supports_unified_memory;

    // Performance estimate
    double compute_score;  // TFLOPS estimate
};

Usage Examples

Device Query
#include <cyxwiz/device.h>

using namespace cyxwiz;

// Get all available devices
auto devices = Device::GetAllDevices();
for (const auto& dev : devices) {
    std::cout << "Device " << dev.device_id << ": " << dev.name << std::endl;
    std::cout << "  Type: " << (dev.type == DeviceType::CUDA ? "CUDA" : "Other") << std::endl;
    std::cout << "  Memory: " << dev.total_memory / (1024*1024) << " MB" << std::endl;
    std::cout << "  Compute Units: " << dev.compute_units << std::endl;
}

// Get current device info
DeviceInfo info = Device::GetInfo();
std::cout << "Current device: " << info.name << std::endl;

Device Selection
// Select by device ID
Device::SetDevice(0);  // First GPU

// Select by type
Device::SetDevice(DeviceType::CUDA, 0);   // First CUDA device
Device::SetDevice(DeviceType::OpenCL, 1); // Second OpenCL device
Device::SetDevice(DeviceType::CPU);       // CPU backend

// Auto-select best available
Device::SetDevice(DeviceType::Auto);

// Check what's available
if (Device::IsBackendAvailable(Backend::CUDA)) {
    Device::SetBackend(Backend::CUDA);
}
Memory Management
// Check memory usage
size_t used = Device::GetMemoryUsed();
size_t total = Device::GetMemoryTotal();
size_t available = Device::GetMemoryAvailable();

std::cout << "Memory: " << used / (1024*1024) << " / "
          << total / (1024*1024) << " MB" << std::endl;

// Force garbage collection
Device::GarbageCollect();

// Free cached memory
Device::FreeMemory();

Multi-GPU Support

// Get device count
int num_gpus = Device::GetDeviceCount(DeviceType::CUDA);
std::cout << "Found " << num_gpus << " CUDA devices" << std::endl;

// Enable peer-to-peer access between GPUs
if (num_gpus >= 2 && Device::CanAccessPeer(0, 1)) {
    Device::EnablePeerAccess(0, 1);
    Device::EnablePeerAccess(1, 0);
}

// Use different GPUs for different operations
Device::SetDevice(0);
Tensor a = Randn({1000, 1000});

Device::SetDevice(1);
Tensor b = Randn({1000, 1000});

// Copy between devices
Tensor a_on_1 = a.ToDevice(DeviceType::CUDA);  // Copy a to the currently selected CUDA device (device 1)

Device Context Manager

class DeviceScope {
public:
    DeviceScope(int device_id);
    DeviceScope(DeviceType type, int index = 0);
    ~DeviceScope();

    // Disable copy
    DeviceScope(const DeviceScope&) = delete;
    DeviceScope& operator=(const DeviceScope&) = delete;

private:
    int previous_device_;
};

Usage

// Temporary device switch
{
    DeviceScope scope(1);  // Switch to device 1
    Tensor t = Randn({1000, 1000});  // Created on device 1
    // Operations here run on device 1
}  // Automatically switches back to previous device

Stream Management (Advanced)

// Create multiple streams for concurrent operations
Stream stream1;
Stream stream2;

stream1.SetCurrent();
Tensor a = Randn({1000, 1000});
Tensor c = a.MatMul(a);  // Runs on stream1

stream2.SetCurrent();
Tensor b = Randn({1000, 1000});
Tensor d = b.MatMul(b);  // Runs on stream2 concurrently

// Wait for both
stream1.Synchronize();
stream2.Synchronize();

Python Bindings

import pycyxwiz as cyx

# Device info
devices = cyx.device.get_all_devices()
for dev in devices:
    print(f"Device {dev.id}: {dev.name}")
    print(f"  Memory: {dev.total_memory // (1024**2)} MB")

# Device selection
cyx.device.set_device(0)
cyx.device.set_device('cuda', 0)
cyx.device.set_device('cpu')

# Memory
used = cyx.device.memory_used()
total = cyx.device.memory_total()
print(f"Memory: {used / total * 100:.1f}% used")

# Context manager
with cyx.device.DeviceScope(1):
    # Operations here run on device 1
    tensor = cyx.randn([1000, 1000])

# Check capabilities
if cyx.device.is_backend_available('cuda'):
    cyx.device.set_backend('cuda')

Performance Tips

Device Selection
  1. Use CUDA for NVIDIA GPUs
  2. Use OpenCL for AMD/Intel
  3. Fall back to CPU for debugging
Memory Management
  1. Monitor memory usage
  2. Clear cache periodically
  3. Use pinned memory for transfers
  4. Pre-allocate tensors
Multi-GPU
  1. Enable peer access
  2. Balance workload evenly
  3. Minimize GPU-to-GPU transfers
Error Handling
try {
    Device::SetDevice(DeviceType::CUDA, 0);
} catch (const DeviceException& e) {
    // Fall back to CPU
    Device::SetDevice(DeviceType::CPU);
}