Docs › Device
Device API Reference
Device management and GPU acceleration, providing abstraction over CUDA, OpenCL, and CPU backends via ArrayFire.
Device Types
enum class DeviceType {
CPU, // CPU backend
CUDA, // NVIDIA CUDA
OpenCL, // OpenCL (AMD, Intel, etc.)
Metal, // Apple Metal (macOS/iOS)
Auto // Automatic selection (best available)
};
enum class Backend {
CPU,
CUDA,
OpenCL,
Metal
};

Device Class
class CYXWIZ_API Device {
public:
// Get device information
static DeviceInfo GetInfo(int device_id = -1);
static std::vector<DeviceInfo> GetAllDevices();
// Device selection
static void SetDevice(int device_id);
static void SetDevice(DeviceType type, int device_index = 0);
static int GetDevice();
static DeviceType GetDeviceType();
// Backend management
static void SetBackend(Backend backend);
static Backend GetBackend();
static std::vector<Backend> GetAvailableBackends();
static bool IsBackendAvailable(Backend backend);
// Memory management
static size_t GetMemoryUsed();
static size_t GetMemoryTotal();
static size_t GetMemoryAvailable();
static void FreeMemory();
static void GarbageCollect();
// Synchronization
static void Sync();
static void SyncAll();
// Multi-GPU
static int GetDeviceCount();
static int GetDeviceCount(DeviceType type);
static void EnablePeerAccess(int device_from, int device_to);
static bool CanAccessPeer(int device_from, int device_to);
};

DeviceInfo Structure
struct DeviceInfo {
int device_id;
std::string name;
DeviceType type;
Backend backend;
// Memory
size_t total_memory;
size_t available_memory;
size_t used_memory;
// Compute capabilities
int compute_units; // CUDA cores / OpenCL compute units
int max_work_group_size;
int max_threads_per_block;
int warp_size; // 32 for NVIDIA, 64 for AMD
// Version info
std::string driver_version;
int compute_capability_major; // CUDA only
int compute_capability_minor; // CUDA only
// Features
bool supports_double;
bool supports_half;
bool supports_unified_memory;
// Performance estimate
double compute_score; // TFLOPS estimate
};

Usage Examples
Device Query
#include <cyxwiz/device.h>
using namespace cyxwiz;
// Get all available devices
auto devices = Device::GetAllDevices();
for (const auto& dev : devices) {
std::cout << "Device " << dev.device_id << ": " << dev.name << std::endl;
std::cout << " Type: " << (dev.type == DeviceType::CUDA ? "CUDA" : "Other") << std::endl;
std::cout << " Memory: " << dev.total_memory / (1024*1024) << " MB" << std::endl;
std::cout << " Compute Units: " << dev.compute_units << std::endl;
}
// Get current device info
DeviceInfo info = Device::GetInfo();
std::cout << "Current device: " << info.name << std::endl;

Device Selection
// Select by device ID
Device::SetDevice(0); // First GPU
// Select by type
Device::SetDevice(DeviceType::CUDA, 0); // First CUDA device
Device::SetDevice(DeviceType::OpenCL, 1); // Second OpenCL device
Device::SetDevice(DeviceType::CPU); // CPU backend
// Auto-select best available
Device::SetDevice(DeviceType::Auto);
// Check what's available
if (Device::IsBackendAvailable(Backend::CUDA)) {
Device::SetBackend(Backend::CUDA);
}

Memory Management
// Check memory usage
size_t used = Device::GetMemoryUsed();
size_t total = Device::GetMemoryTotal();
size_t available = Device::GetMemoryAvailable();
std::cout << "Memory: " << used / (1024*1024) << " / "
<< total / (1024*1024) << " MB" << std::endl;
// Force garbage collection
Device::GarbageCollect();
// Free cached memory
Device::FreeMemory();

Multi-GPU Support
// Get device count
int num_gpus = Device::GetDeviceCount(DeviceType::CUDA);
std::cout << "Found " << num_gpus << " CUDA devices" << std::endl;
// Enable peer-to-peer access between GPUs
if (num_gpus >= 2 && Device::CanAccessPeer(0, 1)) {
Device::EnablePeerAccess(0, 1);
Device::EnablePeerAccess(1, 0);
}
// Use different GPUs for different operations
Device::SetDevice(0);
Tensor a = Randn({1000, 1000});
Device::SetDevice(1);
Tensor b = Randn({1000, 1000});
// Copy between devices
Tensor a_on_1 = a.ToDevice(DeviceType::CUDA); // Copy to current device

Device Context Manager
class DeviceScope {
public:
DeviceScope(int device_id);
DeviceScope(DeviceType type, int index = 0);
~DeviceScope();
// Disable copy
DeviceScope(const DeviceScope&) = delete;
DeviceScope& operator=(const DeviceScope&) = delete;
private:
int previous_device_;
};

Usage
// Temporary device switch
{
DeviceScope scope(1); // Switch to device 1
Tensor t = Randn({1000, 1000}); // Created on device 1
// Operations here run on device 1
} // Automatically switches back to previous device

Stream Management (Advanced)
// Create multiple streams for concurrent operations
Stream stream1;
Stream stream2;
stream1.SetCurrent();
Tensor a = Randn({1000, 1000});
Tensor c = a.MatMul(a); // Runs on stream1
stream2.SetCurrent();
Tensor b = Randn({1000, 1000});
Tensor d = b.MatMul(b); // Runs on stream2 concurrently
// Wait for both
stream1.Synchronize();
stream2.Synchronize();

Python Bindings
import pycyxwiz as cyx
# Device info
devices = cyx.device.get_all_devices()
for dev in devices:
print(f"Device {dev.id}: {dev.name}")
print(f" Memory: {dev.total_memory // (1024**2)} MB")
# Device selection
cyx.device.set_device(0)
cyx.device.set_device('cuda', 0)
cyx.device.set_device('cpu')
# Memory
used = cyx.device.memory_used()
total = cyx.device.memory_total()
print(f"Memory: {used / total * 100:.1f}% used")
# Context manager
with cyx.device.DeviceScope(1):
# Operations here run on device 1
tensor = cyx.randn([1000, 1000])
# Check capabilities
if cyx.device.is_backend_available('cuda'):
cyx.device.set_backend('cuda')

Performance Tips
Device Selection
- Use CUDA for NVIDIA GPUs
- Use OpenCL for AMD/Intel
- Fall back to CPU for debugging
Memory Management
- Monitor memory usage
- Clear cache periodically
- Use pinned memory for transfers
- Pre-allocate tensors
Multi-GPU
- Enable peer access
- Balance workload evenly
- Minimize GPU-to-GPU transfers
Error Handling
try {
Device::SetDevice(DeviceType::CUDA, 0);
} catch (const DeviceException& e) {
// Fall back to CPU
Device::SetDevice(DeviceType::CPU);
}