← Back

Module distributed

enum Backend

Source: distributed.joule:18

struct ProcessGroup

Process group for distributed communication

Source: distributed.joule:30

fn init_from_env() -> Result<Self, DistributedError>

Initialize process group from environment

Source: distributed.joule:42

fn init_with(

Initialize the process group with explicitly provided parameters

Source: distributed.joule:86

fn init(&mut self) -> Result<(), DistributedError>

Initialize the process group

Source: distributed.joule:108

fn init_nccl(&self) -> Result<(), DistributedError>

Source: distributed.joule:120

fn ncclGetUniqueId(id: *mut u8) -> i32;

Source: distributed.joule:124

fn ncclCommInitRank(

Source: distributed.joule:125

fn init_gloo(&self) -> Result<(), DistributedError>

Source: distributed.joule:143

fn init_mpi(&self) -> Result<(), DistributedError>

Source: distributed.joule:148

fn MPI_Init(argc: *mut i32, argv: *mut *mut *mut i8) -> i32;

Source: distributed.joule:152

fn MPI_Comm_rank(comm: i32, rank: *mut i32) -> i32;

Source: distributed.joule:153

fn MPI_Comm_size(comm: i32, size: *mut i32) -> i32;

Source: distributed.joule:154

fn init_tcp(&self) -> Result<(), DistributedError>

Source: distributed.joule:167

fn rank(&self) -> usize

Get the rank of the current process

Source: distributed.joule:187

fn world_size(&self) -> usize

Get the world size (total number of processes in the group)

Source: distributed.joule:192

fn local_rank(&self) -> usize

Get the local rank of this process within its node

Source: distributed.joule:197

fn is_master(&self) -> bool

Check if this is the master process

Source: distributed.joule:202

fn barrier(&self) -> Result<(), DistributedError>

Source: distributed.joule:208

fn MPI_Barrier(comm: i32) -> i32;

Source: distributed.joule:220

fn all_reduce(tensor: &mut Tensor, op: ReduceOp, group: &ProcessGroup) -> Result<(), DistributedError>

Source: distributed.joule:238

enum ReduceOp

Source: distributed.joule:249

fn all_reduce_nccl(tensor: &mut Tensor, op: ReduceOp) -> Result<(), DistributedError>

Source: distributed.joule:257

fn all_reduce_mpi(tensor: &mut Tensor, op: ReduceOp) -> Result<(), DistributedError>

Source: distributed.joule:265

fn all_reduce_ring(tensor: &mut Tensor, op: ReduceOp, group: &ProcessGroup) -> Result<(), DistributedError>

Ring all-reduce (pure Joule implementation)

Source: distributed.joule:274

fn apply_reduce_op(a: &Tensor, b: &Tensor, op: ReduceOp) -> Tensor

Source: distributed.joule:314

fn send_recv(

Source: distributed.joule:324

fn broadcast(tensor: &mut Tensor, src: usize, group: &ProcessGroup) -> Result<(), DistributedError>

Source: distributed.joule:346

fn broadcast_tree(tensor: &mut Tensor, src: usize, group: &ProcessGroup) -> Result<(), DistributedError>

Source: distributed.joule:373

fn gather(

Source: distributed.joule:388

fn scatter(

Source: distributed.joule:417

fn all_gather(tensor: &Tensor, group: &ProcessGroup) -> Result<Vec<Tensor>, DistributedError>

Source: distributed.joule:442

fn reduce_scatter(

Source: distributed.joule:458

struct DistributedDataParallel

Distributed Data Parallel wrapper

Source: distributed.joule:476

fn new(module: M, process_group: ProcessGroup) -> Self

Wrap module for distributed training

Source: distributed.joule:485

fn bucket_cap_mb(mut self, cap: f64) -> Self

Set bucket size for gradient bucketing

Source: distributed.joule:497

fn find_unused_parameters(mut self) -> Self

Enable detection of unused parameters

Source: distributed.joule:503

fn sync_parameters(&mut self) -> Result<(), DistributedError>

Source: distributed.joule:510

fn sync_gradients(&mut self) -> Result<(), DistributedError>

Source: distributed.joule:519

fn module(&self) -> &M

Get inner module reference

Source: distributed.joule:535

fn module_mut(&mut self) -> &mut M

Get mutable inner module reference

Source: distributed.joule:540

fn forward(&self, input: &Tensor) -> Tensor

Source: distributed.joule:546

fn parameters(&self) -> Vec<&Tensor>

Source: distributed.joule:550

fn parameters_mut(&mut self) -> Vec<&mut Tensor>

Source: distributed.joule:554

fn train(&mut self, mode: bool)

Source: distributed.joule:558

fn training(&self) -> bool

Source: distributed.joule:562

fn to(&mut self, device: Device)

Source: distributed.joule:566

struct PipelineStage

Pipeline parallelism stage

Source: distributed.joule:576

fn new(module: M, stage_id: usize, num_stages: usize, num_microbatches: usize) -> Self

Source: distributed.joule:584

fn forward_pipeline(&self, inputs: Vec<Tensor>, group: &ProcessGroup) -> Vec<Tensor>

Source: distributed.joule:595

struct TensorParallel

Tensor parallel for large layers

Source: distributed.joule:623

fn new(module: M, tp_rank: usize, tp_size: usize, split_dim: i64) -> Self

Source: distributed.joule:631

fn forward(&self, input: &Tensor, group: &ProcessGroup) -> Result<Tensor, DistributedError>

Source: distributed.joule:642

enum DistributedError

Source: distributed.joule:667

fn from(e: std::io::Error) -> Self

Source: distributed.joule:676

fn launch(

Launch distributed training

Source: distributed.joule:686

fn get_device(group: &ProcessGroup) -> Device

Get optimal device for this rank

Source: distributed.joule:715

struct DistributedContext

Distributed training context

Source: distributed.joule:720

fn init() -> Result<Self, DistributedError>

Source: distributed.joule:726

fn print(&self, msg: &str)

Print a message, but only from the master process

Source: distributed.joule:734

fn barrier(&self) -> Result<(), DistributedError>

Synchronize: block until all processes in the group reach this barrier

Source: distributed.joule:741