Diff Coverage

Source File	Diff Coverage (%)	Missing Lines
hyper_parallel/core/optimizer/__init__.py	0.0%	86,97,104,108
hyper_parallel/core/optimizer/adamw.py	0.0%	41-42
hyper_parallel/core/optimizer/dtensor_compat.py	0.0%	22,24-25,27,29-30,33,36,38,42,45,47,50,57,59-60,62-66,68-70,72,74-76,79,81-84,86-87,89,93,95-98,101,103-107,110,112-116,119,121-125,128,130-131,133-134,138,140-144,147,149,152,155-156,160,169,171-178,181,183
hyper_parallel/core/optimizer/muon.py	0.0%	20,25-26,125,128-133,341-342,344,346,348,429,549
hyper_parallel/core/optimizer/optimizer.py	0.0%	20,29,181,190,196,205,212-216,218-219,222-229,231,233-236,239,242-243,245,247-248,250-252,254,256,258-259,262-270,272,274-280,282-283,285-287,289-290,294-295,297,303,305,307-308,310-311,313,315,321-322,325-334,336-337,341-345,349-350,354-355,358-359,361-363,366-369,371,373,375-376,381,388-389,392-393,429-432,447,481-483,485-486,489,560,745,747,904
hyper_parallel/core/optimizer/sharding_category.py	0.0%	25,276-278

hyper_parallel/core/optimizer/dtensor_compat.py

Provides lazy exports (PEP 562) for DTensor, DeviceMesh, Shard, Replicate, 
and StridedShard based on the detected backend ('torch' or 'hyper').
"""

from __future__ import annotations

import logging
from typing import Any, List

import torch.distributed._tensor as torch_dt

# NOTE(review): basicConfig() runs at import time and installs a handler on the
# root logger when none is configured yet. Libraries normally leave logging
# configuration to the application -- confirm this side effect is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global backend flag. Rewritten by detect_dtensor_backend() and read by the
# get_*_cls() accessors below to pick which implementation to import.
_DTENSOR_BACKEND: str = "hyper"  # "hyper" or "torch"


class _NeverMatch:
    """Safe fallback class that always returns False for ``isinstance()``.

    Returned by ``get_strided_shard_cls()`` for the 'torch' backend so callers
    can keep a uniform ``isinstance(placement, StridedShard)`` check. Nothing
    in this module instantiates it, so such checks evaluate to False.
    """
    # No per-instance attributes are needed; the class is used only as a type.
    __slots__ = ()


# Lazy-export cache: maps an exported attribute name (e.g. "DTensor") to the
# object produced by its resolver. Filled by the module-level __getattr__ and
# emptied by _invalidate_lazy_cache() whenever the backend is (re)detected.
_LAZY_CACHE: dict = {}


def _invalidate_lazy_cache() -> None:
    """Clear the lazy-export cache to rebuild on next access.

    Called by ``detect_dtensor_backend()`` after the backend flag is set so
    that previously resolved exports are re-resolved for the current backend.
    """
    # Clear in place so every reference to the dict sees the reset.
    _LAZY_CACHE.clear()


def detect_dtensor_backend(
        adamw_params: List[Any],
        muon_params: List[Any],
) -> str:
    """Detect and set the DTensor backend ('torch' or 'hyper') from parameter lists.

    The first parameter found in ``muon_params`` is used as the sample; when
    that list yields none, ``adamw_params`` is consulted instead. Only this one
    sample is inspected: the backend is 'torch' when the module of the sample's
    type starts with ``torch.distributed``, and 'hyper' otherwise (including
    when no parameter is found at all).

    Side effects: rebinds the module-level ``_DTENSOR_BACKEND`` flag and clears
    the lazy-export cache so exports are re-resolved for the new backend.

    Args:
        adamw_params: Parameters or param-group dicts handled by AdamW.
        muon_params: Parameters or param-group dicts handled by Muon.

    Returns:
        The backend name that was selected.
    """
    global _DTENSOR_BACKEND  # pylint: disable=global-statement

    sample_param = _extract_first_param(muon_params)
    if sample_param is None:
        sample_param = _extract_first_param(adamw_params)

    if sample_param is None:
        logger.info("No parameters found for backend detection; defaulting to 'hyper'.")
        _DTENSOR_BACKEND = "hyper"
    else:
        # The defining module of the sample's class tells the two DTensor
        # implementations apart without importing either of them here.
        param_cls_module = type(sample_param).__module__
        if param_cls_module.startswith("torch.distributed"):
            _DTENSOR_BACKEND = "torch"
        else:
            _DTENSOR_BACKEND = "hyper"
        logger.info("Detected DTensor backend: '%s'.", _DTENSOR_BACKEND)

    _invalidate_lazy_cache()
    return _DTENSOR_BACKEND


def _extract_first_param(param_groups: List[Any]) -> Any:
    """Return the first parameter from a list of param groups, or None."""
    for group in param_groups:
        params = group.get("params", []) if isinstance(group, dict) else []
        for p in params:
            return p

    for p in param_groups:
        return p

    return None


# Accessor functions
def get_dtensor_cls():
    """Return the ``DTensor`` class that matches the currently selected backend.

    The hyper implementation is imported only when it is the one requested.
    """
    if _DTENSOR_BACKEND != "torch":
        # pylint: disable=import-outside-toplevel
        from hyper_parallel.core.dtensor.dtensor import DTensor as hyper_dtensor_cls
        return hyper_dtensor_cls
    return torch_dt.DTensor


def get_device_mesh_cls():
    """Return the ``DeviceMesh`` class that matches the currently selected backend.

    The import happens at call time, so this function only imports the module
    of the backend that is active.
    """
    # pylint: disable=import-outside-toplevel
    if _DTENSOR_BACKEND != "torch":
        from hyper_parallel.core.dtensor.device_mesh import DeviceMesh as mesh_cls
    else:
        from torch.distributed.device_mesh import DeviceMesh as mesh_cls
    return mesh_cls


def get_shard_cls():
    """Return the ``Shard`` placement class that matches the currently selected backend.

    The import happens at call time, so this function only imports the module
    of the backend that is active.
    """
    # pylint: disable=import-outside-toplevel
    if _DTENSOR_BACKEND != "torch":
        from hyper_parallel.core.dtensor.placement_types import Shard as shard_cls
    else:
        from torch.distributed._tensor.placement_types import Shard as shard_cls
    return shard_cls


def get_replicate_cls():
    """Return the ``Replicate`` placement class that matches the currently selected backend.

    The import happens at call time, so this function only imports the module
    of the backend that is active.
    """
    # pylint: disable=import-outside-toplevel
    if _DTENSOR_BACKEND != "torch":
        from hyper_parallel.core.dtensor.placement_types import Replicate as replicate_cls
    else:
        from torch.distributed._tensor.placement_types import Replicate as replicate_cls
    return replicate_cls


def get_strided_shard_cls():
    """Return the ``StridedShard`` placement class for the active backend.

    For the 'torch' backend the ``_NeverMatch`` placeholder class is returned
    instead, so ``isinstance(x, StridedShard)`` checks stay valid and simply
    evaluate to False.
    """
    if _DTENSOR_BACKEND != "torch":
        # pylint: disable=import-outside-toplevel
        from hyper_parallel.core.dtensor.placement_types import StridedShard as strided_cls
        return strided_cls

    return _NeverMatch


# DTensor union type resolver
def _import_hyper_dtensor():
    """Import hyper DTensor class; return torch DTensor as fallback."""
    try:
        from hyper_parallel.core.dtensor.dtensor import DTensor  # pylint: disable=import-outside-toplevel
        return DTensor
    except ImportError:
        return torch_dt.DTensor


def _resolve_dtensor_union():
    """Build the ``torch DTensor | hyper DTensor`` union lazily, at call time.

    The result is independent of the active backend: it always combines torch's
    ``DTensor`` with whatever ``_import_hyper_dtensor()`` returns.
    """
    hyper_side = _import_hyper_dtensor()
    torch_side = torch_dt.DTensor
    return torch_side | hyper_side


def to_local_if_dtensor(tensor: Any) -> Any:
    """Return the local shard if `tensor` is a DTensor, otherwise return as-is.

    The DTensor type used for the check is the same object served by the lazy
    ``DTensor`` export. It is taken from the lazy-export cache when present;
    on a miss it is resolved once and stored there, so repeated calls (e.g. one
    per parameter in an optimizer step) do not rebuild the union or re-attempt
    the hyper import every time. ``_invalidate_lazy_cache()`` drops the stored
    value like any other lazy export.

    Args:
        tensor: Any object; only DTensor instances are unwrapped.

    Returns:
        ``tensor.to_local()`` for a DTensor, otherwise ``tensor`` unchanged.
    """
    dtensor_type = _LAZY_CACHE.get("DTensor")
    if dtensor_type is None:
        # Resolve through the resolver directly: the module-level __getattr__
        # hook is not triggered by name lookups made from inside this module.
        dtensor_type = _resolve_dtensor_union()
        _LAZY_CACHE["DTensor"] = dtensor_type

    if isinstance(tensor, dtensor_type):
        return tensor.to_local()
    return tensor


# lazy exports
# Maps each lazily exported attribute name to the zero-argument callable that
# produces its value. Consulted by the module-level __getattr__ and __dir__.
_LAZY_RESOLVERS = {
    # "DTensor" resolves to the torch/hyper union rather than to the class of
    # the active backend (get_dtensor_cls() returns the latter).
    "DTensor": _resolve_dtensor_union,
    "DeviceMesh": get_device_mesh_cls,
    "Shard": get_shard_cls,
    "Replicate": get_replicate_cls,

    "StridedShard": get_strided_shard_cls,
}


def __getattr__(name):  # type: ignore[no-untyped-def]  # pylint: disable=invalid-name
    """Module-level attribute hook (PEP 562) serving the lazy exports.

    Names listed in ``_LAZY_RESOLVERS`` are resolved on first access and then
    served from ``_LAZY_CACHE``; any other name raises ``AttributeError``.
    """
    resolver = _LAZY_RESOLVERS.get(name)
    if resolver is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    cached = _LAZY_CACHE.get(name)
    if cached is not None:
        return cached

    resolved = resolver()
    _LAZY_CACHE[name] = resolved
    return resolved


def __dir__():  # type: ignore[no-untyped-def]  # pylint: disable=invalid-name
    """List module globals followed by the lazily exported names.

    Makes the lazy exports visible to ``dir()`` (and tools built on it) even
    before they have been resolved.
    """
    return [*globals(), *_LAZY_RESOLVERS]

hyper_parallel/core/optimizer/muon.py

"""Muon optimizer with HSDP shard-group-aware communication."""

import math
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
import torch.distributed as dist

from hyper_parallel.core.optimizer.optimizer import AsyncReplicateBroadcaster, BaseDistributedOptimizer
from hyper_parallel.core.optimizer.dtensor_compat import to_local_if_dtensor
from hyper_parallel.core.optimizer.sharding_category import (
    HSDPGroupAssignment,
    fused_allgather_dtensor_params,
    build_owner_by_size,

            "momentum": momentum,
            "nesterov": nesterov,
            "ns_steps": ns_steps,
        }
        super().__init__(params, defaults, is_muon=True, hsdp_replica_count=hsdp_replica_count)

        self._group_dtensor_by_mesh()
        deduced_count = self._auto_deduce_replica_count()
        if deduced_count is None:
            self.hsdp_replica_count = None
        elif self.hsdp_replica_count is None:
            self.hsdp_replica_count = deduced_count
        self._split_replicate_groups()
        self._build_hsdp_batch()
        self._build_param_broadcast_info()
        self._classify_parameters_for_step()

                )

                # Fused batched apply — all params in the same sub_batch share
                # the same adjusted_lr, so we can use foreach ops.
                local_params = [to_local_if_dtensor(p.data) for p in sub_batch]
                local_updates = [updates_dict[p].view(lp.shape) for p, lp in zip(sub_batch, local_params)]

                if weight_decay != 0.0:
                    # pylint: disable=protected-access
                    torch._foreach_mul_(local_params, 1 - lr * weight_decay)
                # pylint: disable=protected-access
                torch._foreach_add_(local_params, local_updates, alpha=-adjusted_lr)

    def _gather_and_compute_shard_updates(
            self,
            valid_params: List[torch.nn.Parameter],

            buffer_cache: Optional[Dict] = None,
    ) -> None:
        """Process sharded params with greedy shard-group compute assignment."""
        platform = get_platform()
        device = torch.npu.current_device() if torch.npu.is_available() else torch.cuda.current_device()

        lr = group["lr"]
        weight_decay = group["weight_decay"]
        rms = group["matched_adamw_rms"]

        slice_sizes = []
        shapes_info = []

        for p in p_list:
            origin_shape = tuple(getattr(p, 'local_shape', None) or p.to_local().shape) if no_shard else tuple(p.shape)
            ns_input = ns_inputs[p].view(origin_shape)

            is_conv = False
            if len(origin_shape) == 2:

hyper_parallel/core/optimizer/optimizer.py

"""Base distributed optimizer and chain optimizer composition."""

from collections import defaultdict
import logging
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
import torch.distributed as dist
from torch.distributed.checkpoint.state_dict import (

    StateDictOptions,
    get_optimizer_state_dict,
    set_optimizer_state_dict,
)
from hyper_parallel.core.optimizer.dtensor_compat import to_local_if_dtensor
from hyper_parallel.core.optimizer.sharding_category import (
    HSDPGroupAssignment,
    build_owner_by_size,
    get_multi_dim_logical_info,


    Provides fused hierarchical broadcast for parameters and optimizer states.
    """

    def __init__(
            self,
            params: Any,
            defaults: Dict[str, Any],
            is_muon: bool,

            hsdp_replica_count: Optional[Union[int, Tuple[int, ...]]] = None,
    ) -> None:
        super().__init__(params, defaults)
        self.is_muon = is_muon
        self.hsdp_replica_count = hsdp_replica_count
        self._param_to_broadcast_info: Dict[
            torch.nn.Parameter, Tuple[Tuple[int, ...], Tuple[dist.ProcessGroup, ...]]
        ] = {}

            torch.nn.Parameter, Tuple[Tuple[int, ...], Tuple[dist.ProcessGroup, ...]]
        ] = {}

        # Cache: (parent_ranks_tuple, sub_size) -> {sub_idx: sub_pg}
        self._split_sub_pg_cache: Dict[Tuple[Tuple[int, ...], int], Dict[int, dist.ProcessGroup]] = {}

    def _group_dtensor_by_mesh(self):
        """Group DTensor parameters by mesh topology and shard layout."""
        self._hsdp_grouping: Dict[int, Tuple[List, List]] = {}

        for group_key, group in enumerate(self.param_groups):
            no_comm_params, hsdp_groups = group_parameters_for_hsdp(group["params"])
            self._hsdp_grouping[group_key] = (no_comm_params, hsdp_groups)

    def _auto_deduce_replica_count(self) -> Optional[Union[int, Tuple[int, ...]]]:
        """Deduce hsdp_replica_count based on cluster topology.

        - Intra-node PGs: Full dedup (no split), high bandwidth makes broadcast cheap.
        - Inter-node PGs: Split at node boundaries to restrict communication domains

        - Intra-node PGs: Full dedup (no split), high bandwidth makes broadcast cheap.
        - Inter-node PGs: Split at node boundaries to restrict communication domains 
          within a single node, bypassing cross-node bottlenecks.
        """
        devices_per_node = 1
        if torch.npu.is_available():
            devices_per_node = torch.npu.device_count()
        elif torch.cuda.is_available():
            devices_per_node = torch.cuda.device_count()

        dedup_per_dim: Dict[int, int] = {}
        needs_split = False

        # group_key, (no_comm_params, hsdp_groups)
        for _, (_, hsdp_groups) in self._hsdp_grouping.items():
            for hsdp_group in hsdp_groups:
                for dim_idx, pg in enumerate(hsdp_group.replicate_pgs):
                    if pg is None:
                        continue
                    pg_size = dist.get_world_size(pg)
                    if pg_size <= 1:
                        continue

                    if pg_size > devices_per_node:
                        # Inter-node: Find largest divisor safe for node boundary
                        dedup = min(pg_size, devices_per_node)
                        while pg_size % dedup != 0:
                            dedup -= 1
                        needs_split = True
                    else:
                        # origin Inter-node
                        dedup = pg_size

                    # Enforce conservative (smallest) dedup across shared mesh axes
                    if dim_idx not in dedup_per_dim:
                        dedup_per_dim[dim_idx] = dedup
                    else:
                        dedup_per_dim[dim_idx] = min(dedup_per_dim[dim_idx], dedup)

        if not needs_split:
            return None

        sorted_dedups = [dedup_per_dim[k] for k in sorted(dedup_per_dim.keys())]
        if len(sorted_dedups) == 1:
            return sorted_dedups[0]

        return tuple(sorted_dedups)

    def _split_replicate_groups(self) -> None:
        """Split replicate ProcessGroups into smaller sub-groups based on hsdp_replica_count."""
        if self.hsdp_replica_count is None:
            return

        # Argument validation
        if isinstance(self.hsdp_replica_count, int):
            if self.hsdp_replica_count <= 0:
                raise ValueError(f"hsdp_replica_count must be positive, got {self.hsdp_replica_count}")
        elif isinstance(self.hsdp_replica_count, tuple):
            if not self.hsdp_replica_count:
                raise ValueError("hsdp_replica_count tuple must not be empty")
            for i, v in enumerate(self.hsdp_replica_count):
                if not isinstance(v, int) or v <= 0:
                    raise ValueError(f"hsdp_replica_count[{i}] must be positive, got {v}")
        else:
            raise TypeError(f"Unsupported hsdp_replica_count type: {type(self.hsdp_replica_count).__name__}")

        for group_key, (_, hsdp_groups) in self._hsdp_grouping.items():
            for hsdp_group in hsdp_groups:
                new_replicate_pgs: List[dist.ProcessGroup] = []
                for dim_idx, pg in enumerate(hsdp_group.replicate_pgs):
                    if pg is None:
                        new_replicate_pgs.append(pg)
                        continue

                    pg_size = dist.get_world_size(pg)
                    dedup_size = self._get_dedup_size_for_dim(dim_idx, pg_size)

                    if pg_size <= dedup_size:
                        new_replicate_pgs.append(pg)
                        continue

                    if pg_size % dedup_size != 0:
                        raise ValueError(
                            f"hsdp_replica_count {dedup_size} must evenly divide replicate group size {pg_size}"
                        )

                    sub_pg = self._get_or_create_sub_pg(pg, dedup_size)
                    new_replicate_pgs.append(sub_pg)

                    logger.info_rank0(
                        "[HSDP Split] group_key=%s, dim_idx=%s, original_size=%s, sub_size=%s",
                        group_key, dim_idx, pg_size, dedup_size
                    )

                        group_key, dim_idx, pg_size, dedup_size
                    )

                # replace new sub groups of hsdp groups
                hsdp_group.replicate_pgs = tuple(new_replicate_pgs)

    def _get_dedup_size_for_dim(self, dim_idx: int, pg_size: int) -> int:
        """Get the effective dedup size for a specific replicate dimension."""
        if isinstance(self.hsdp_replica_count, int):
            return self.hsdp_replica_count

        if dim_idx < len(self.hsdp_replica_count):
            return self.hsdp_replica_count[dim_idx]

        return pg_size

    def _get_or_create_sub_pg(
            self,
            parent_pg: dist.ProcessGroup,
            sub_size: int,
    ) -> dist.ProcessGroup:
        """Retrieve or collectively create a synchronized sub-ProcessGroup.

        The parent group's sorted ranks are cut into consecutive chunks of
        ``sub_size`` ranks and this rank's chunk is returned as a ProcessGroup.
        Results are cached in ``self._split_sub_pg_cache`` under
        ``(sorted parent ranks, sub_size)``.

        On a cache miss the parent rank lists of all ranks are exchanged over
        the default group, and ``dist.new_group`` is called for every chunk of
        every distinct parent list in sorted order, so all ranks issue the same
        sequence of group creations. The exchange buffer has one row of
        ``len(parent ranks)`` per rank, i.e. it assumes every rank passes a
        parent group of the same size.

        Args:
            parent_pg: Replicate ProcessGroup to split.
            sub_size: Number of ranks per sub-group.

        Returns:
            The sub-ProcessGroup that contains the current rank.

        Raises:
            RuntimeError: If the current rank is in none of the sub-groups
                derived from its parent group (cached or newly created).
        """
        local_parent_ranks = tuple(sorted(dist.get_process_group_ranks(parent_pg)))
        cache_key = (local_parent_ranks, sub_size)

        # Cache hit: sub-groups this rank is not part of are stored as None, so
        # return the first stored group that reports this rank as a member.
        cached_map = self._split_sub_pg_cache.get(cache_key)
        if cached_map is not None:
            for cached_pg in cached_map.values():
                if cached_pg is None:
                    continue
                try:
                    # get_rank() reports non-membership as -1, not an exception.
                    if dist.get_rank(group=cached_pg) >= 0:
                        return cached_pg
                except RuntimeError:
                    continue
            raise RuntimeError(f"Current rank not found in cached sub-groups for parent_pg={local_parent_ranks}")

        global_rank = dist.get_rank()
        world_size = dist.get_world_size()

        # Global rendezvous via all_gather_into_tensor on device tensors rather
        # than all_gather_object. ``torch.npu`` only exists once the NPU plugin
        # has registered itself, so probe for it before falling back to CUDA.
        npu = getattr(torch, "npu", None)
        if npu is not None and npu.is_available():
            device = npu.current_device()
        else:
            device = torch.cuda.current_device()
        num_parent_ranks = len(local_parent_ranks)
        local_tensor = torch.tensor(local_parent_ranks, dtype=torch.long, device=device)
        gathered_tensor = torch.empty((world_size, num_parent_ranks), dtype=torch.long, device=device)
        dist.all_gather_into_tensor(gathered_tensor.view(-1), local_tensor)

        # Deduplicate: each row is one rank's parent ranks. Sorting the set of
        # tuples gives every rank the same deterministic iteration order.
        unique_all_parent_ranks = sorted({tuple(row) for row in gathered_tensor.cpu().tolist()})

        # Entries are None for sub-groups that do not contain this rank.
        sub_pg_map: Dict[int, Optional[dist.ProcessGroup]] = {}
        my_sub_pg: Optional[dist.ProcessGroup] = None

        # Synchronized collective creation loop: every rank creates every
        # sub-group, including those it does not belong to.
        for parent_ranks in unique_all_parent_ranks:
            num_sub_groups = len(parent_ranks) // sub_size

            for sub_idx in range(num_sub_groups):
                sub_ranks = parent_ranks[sub_idx * sub_size: (sub_idx + 1) * sub_size]
                sub_pg = dist.new_group(list(sub_ranks))

                # Only record results for this rank's own parent group.
                if parent_ranks != local_parent_ranks:
                    continue
                if global_rank in sub_ranks:
                    my_sub_pg = sub_pg
                    sub_pg_map[sub_idx] = sub_pg
                else:
                    sub_pg_map[sub_idx] = None

        self._split_sub_pg_cache[cache_key] = sub_pg_map

        if my_sub_pg is None:
            raise RuntimeError(
                f"Rank {global_rank} not found in any sub-group of parent_pg "
                f"with ranks {local_parent_ranks} and sub_size={sub_size}"
            )

        return my_sub_pg

    def _build_hsdp_batch(
            self,
            max_batch_numel: Optional[int] = None,

            self,
            max_batch_numel: Optional[int] = None,
    ) -> None:
        """Split HSDP groups into memory-capped batches for compute-broadcast overlap."""
        if max_batch_numel is None:
            broadcast_max_bytes = getattr(
                self, "replicate_broadcast_max_bytes", 512 * 1024 * 1024,
            )
            hsdp_size = self.hsdp_replica_count if self.hsdp_replica_count is not None else 1
            max_batch_numel = broadcast_max_bytes * hsdp_size if hsdp_size > 1 else float('inf')

        self._hsdp_batches: Dict[int, List[Dict]] = {}

        for group_key, (_, hsdp_groups) in self._hsdp_grouping.items():


                    # Soft limit: allow the bucket to slightly exceed the cap
                    # so that symmetric structures stay together and fragmentation
                    # is reduced (same approach as PyTorch FSDP/DDP bucketing).
                    if current_numel >= max_batch_numel:
                        sub_batches.append(current_batch)
                        current_batch = []
                        current_numel = 0

                if current_batch:
                    sub_batches.append(current_batch)

                })

            # log batch split info.
            total_sub_batches = sum(len(bg["sub_batches"]) for bg in batch_groups)
            logger.info_rank0(
                "[HSDP Batch] group_key=%s, num_hsdp_groups=%s, num_batch_groups=%s, "
                "total_sub_batches=%s, group_numels=%s, max_batch_numel=%s",
                group_key,
                len(hsdp_groups),

        # When hsdp_replica_count is set, remap coordinates into
        # sub-groups: each original coord maps to (coord % sub_size)
        # within its sub-group, and the effective group size shrinks.
        # Supports per-dimension control via Tuple[int, ...].
        if self.hsdp_replica_count is not None:
            if isinstance(self.hsdp_replica_count, int):
                dedup_per_dim = (self.hsdp_replica_count,) * len(replicate_group_ranks)
            else:
                dedup_per_dim = self.hsdp_replica_count
            replicate_group_ranks = tuple(
                r % dedup_per_dim[i] for i, r in enumerate(replicate_group_ranks)
            )
            replicate_sizes = tuple(
                min(s, dedup_per_dim[i]) for i, s in enumerate(replicate_sizes)
            )

        # Greedy owner assignment on this sub-batch's records.

                hsdp_group = bg["hsdp_group"]
                sub_batch_assigns: List[HSDPGroupAssignment] = []

                for sub_batch_entries in bg["sub_batches"]:
                    hsdp_assign = self._build_sub_batch_assignment(sub_batch_entries, hsdp_group)
                    if hsdp_assign is not None:
                        sub_batch_assigns.append(hsdp_assign)

                assignment_batch_groups.append({

        Args:
            target: "param" or "state".
            state_keys: State dict keys to broadcast when target="state".
        """
        device = torch.npu.current_device() if torch.npu.is_available() else torch.cuda.current_device()

        alignment = 512  # bytes
        rank_dtype_tensors = self._collect_broadcast_tensors(target, state_keys)

        for (src_coord, dtype, replicate_pgs), tensors in rank_dtype_tensors.items():
            if not tensors:

            tensors: List[torch.Tensor],
            async_op: bool = True,
    ) -> None:
        """Pack and broadcast tensors for one broadcast key."""
        device = torch.npu.current_device() if torch.npu.is_available() else torch.cuda.current_device()
        src_coord, dtype, replicate_pgs = key
        alignment = 512  # bytes

        local_coord = tuple(