← Back

Module distributed

enum Backend

Source: distributed.joule:18

struct ProcessGroup

Process group for distributed communication

Source: distributed.joule:30

fn init_from_env() -> Result<Self, DistributedError>

Initialize process group from environment

Source: distributed.joule:42

fn init_with(

Initialize the process group with explicitly provided parameters

Source: distributed.joule:86

fn init(&mut self) -> Result<(), DistributedError>

Initialize the process group

Source: distributed.joule:108

fn init_nccl(&self) -> Result<(), DistributedError>

Source: distributed.joule:120

fn ncclGetUniqueId(id: *mut u8) -> i32;

Source: distributed.joule:124

fn ncclCommInitRank(

Source: distributed.joule:125

fn init_gloo(&self) -> Result<(), DistributedError>

Source: distributed.joule:143

fn init_mpi(&self) -> Result<(), DistributedError>

Source: distributed.joule:148

fn MPI_Init(argc: *mut i32, argv: *mut *mut *mut i8) -> i32;

Source: distributed.joule:152

fn MPI_Comm_rank(comm: i32, rank: *mut i32) -> i32;

Source: distributed.joule:153

fn MPI_Comm_size(comm: i32, size: *mut i32) -> i32;

Source: distributed.joule:154

fn init_tcp(&self) -> Result<(), DistributedError>

Source: distributed.joule:167

fn rank(&self) -> usize

Get the rank of the current process

Source: distributed.joule:187

fn world_size(&self) -> usize

Get the world size (total number of processes in the group)

Source: distributed.joule:192

fn local_rank(&self) -> usize

Get the local rank of this process within its node

Source: distributed.joule:197

fn is_master(&self) -> bool

Check if this is the master process

Source: distributed.joule:202

fn barrier(&self) -> Result<(), DistributedError>

Source: distributed.joule:208

fn MPI_Barrier(comm: i32) -> i32;

Source: distributed.joule:220

fn all_reduce(tensor: &mut Tensor, op: ReduceOp, group: &ProcessGroup) -> Result<(), DistributedError>

Source: distributed.joule:238

enum ReduceOp

Source: distributed.joule:249

fn all_reduce_nccl(tensor: &mut Tensor, op: ReduceOp) -> Result<(), DistributedError>

Source: distributed.joule:257

fn all_reduce_mpi(tensor: &mut Tensor, op: ReduceOp) -> Result<(), DistributedError>

Source: distributed.joule:265

fn all_reduce_ring(tensor: &mut Tensor, op: ReduceOp, group: &ProcessGroup) -> Result<(), DistributedError>

Ring all-reduce (pure Joule implementation)

Source: distributed.joule:274

fn apply_reduce_op(a: &Tensor, b: &Tensor, op: ReduceOp) -> Tensor

Source: distributed.joule:314

fn send_recv(

Source: distributed.joule:324

fn broadcast(tensor: &mut Tensor, src: usize, group: &ProcessGroup) -> Result<(), DistributedError>

Source: distributed.joule:346

fn broadcast_tree(tensor: &mut Tensor, src: usize, group: &ProcessGroup) -> Result<(), DistributedError>

Source: distributed.joule:373

fn gather(

Source: distributed.joule:388

fn scatter(

Source: distributed.joule:417

fn all_gather(tensor: &Tensor, group: &ProcessGroup) -> Result<Vec<Tensor>, DistributedError>

Source: distributed.joule:442

fn reduce_scatter(

Source: distributed.joule:458

struct DistributedDataParallel

Distributed Data Parallel wrapper

Source: distributed.joule:476

fn new(module: M, process_group: ProcessGroup) -> Self

Wrap module for distributed training

Source: distributed.joule:485

fn bucket_cap_mb(mut self, cap: f64) -> Self

Set bucket size for gradient bucketing

Source: distributed.joule:497

fn find_unused_parameters(mut self) -> Self

Enable detection of unused parameters

Source: distributed.joule:503

fn sync_parameters(&mut self) -> Result<(), DistributedError>

Source: distributed.joule:510

fn sync_gradients(&mut self) -> Result<(), DistributedError>

Source: distributed.joule:519

fn module(&self) -> &M

Get inner module reference

Source: distributed.joule:535

fn module_mut(&mut self) -> &mut M

Get mutable inner module reference

Source: distributed.joule:540

fn forward(&self, input: &Tensor) -> Tensor

Source: distributed.joule:546

fn parameters(&self) -> Vec<&Tensor>

Source: distributed.joule:550

fn parameters_mut(&mut self) -> Vec<&mut Tensor>

Source: distributed.joule:554

fn train(&mut self, mode: bool)

Source: distributed.joule:558

fn training(&self) -> bool

Source: distributed.joule:562

fn to(&mut self, device: Device)

Source: distributed.joule:566

struct PipelineStage

Pipeline parallelism stage

Source: distributed.joule:576

fn new(module: M, stage_id: usize, num_stages: usize, num_microbatches: usize) -> Self

Source: distributed.joule:584

fn forward_pipeline(&self, inputs: Vec<Tensor>, group: &ProcessGroup) -> Vec<Tensor>

Source: distributed.joule:595

struct TensorParallel

Tensor parallel for large layers

Source: distributed.joule:623

fn new(module: M, tp_rank: usize, tp_size: usize, split_dim: i64) -> Self

Source: distributed.joule:631

fn forward(&self, input: &Tensor, group: &ProcessGroup) -> Result<Tensor, DistributedError>

Source: distributed.joule:642

enum DistributedError

Source: distributed.joule:667

fn from(e: std::io::Error) -> Self

Source: distributed.joule:676

fn launch(

Launch distributed training

Source: distributed.joule:686

fn get_device(group: &ProcessGroup) -> Device

Get optimal device for this rank

Source: distributed.joule:715

struct DistributedContext

Distributed training context

Source: distributed.joule:720

fn init() -> Result<Self, DistributedError>

Source: distributed.joule:726

fn print(&self, msg: &str)

Print a message, but only from the master process

Source: distributed.joule:734

fn barrier(&self) -> Result<(), DistributedError>

Synchronize: block until all processes in the group reach this barrier

Source: distributed.joule:741