Diff Coverage

Source File	Diff Coverage (%)	Missing Lines
hyper_parallel/core/context_parallel/__init__.py	100%
hyper_parallel/core/context_parallel/async_context_parallel.py	63.3%	50-51,72,94-98,105,120-124,233-235,237,248-252,260,272,275,294,307,316,325,363-364,368,377-378,381,387,392,398-399,421,455,513,651-654
hyper_parallel/core/context_parallel/async_dsa_context_parallel.py	85.7%	61-63,83-84,92,98,103,106,110-111,116,122,131
hyper_parallel/core/context_parallel/dsa_context_parallel.py	98.1%	507
hyper_parallel/platform/mindspore/platform.py	21.2%	83,88-92,97-104,121-123,127-128,133-135,139-140,155-160,167-172,690-696,701-704,708-709,715-719,1233-1237,1241,1260-1264,1268,1277
hyper_parallel/platform/platform.py	66.7%	592,608,657
hyper_parallel/platform/torch/platform.py	85.2%	75-76,84-88,153

hyper_parallel/core/context_parallel/async_context_parallel.py

# ---------------------------------------------------------------------------

def _detach_if_available(tensor: Tensor) -> Tensor:
    """Detach the communication buffer when the backend tensor exposes ``detach``."""
    detach = getattr(tensor, "detach", None)
    return detach() if detach is not None else tensor


def _launch_async_a2a_seq_to_head(
    tensor: Tensor,

        shape[:head_dim] + [world_size, num_heads // world_size] + shape[head_dim + 1:]
    ).permute(
        [head_dim] + list(range(head_dim)) + list(range(head_dim + 1, ndim))
    ).contiguous()
    out_perm, work = platform.all_to_all_single(_detach_if_available(x_perm), list(x_perm.shape), group, async_op=True)
    return work, out_perm


def _a2a_reconstruct(out_perm: Tensor, concat_dim: int) -> Tensor:



def _move_dim_to_front(tensor: Tensor, dim: int) -> Tensor:
    """Return a contiguous tensor whose leading axis is ``dim`` of the input.

    ``dim`` is first normalized with ``_normalize_dim``. The remaining axes keep
    their relative order. Used to prepare buffers for all-gather/reduce-scatter.
    """
    ndim = tensor.dim()
    lead = _normalize_dim(dim, ndim)
    if lead != 0:
        # Permutation: the chosen axis first, then every other axis in order.
        order = [lead, *(axis for axis in range(ndim) if axis != lead)]
        tensor = tensor.permute(order)
    return tensor.contiguous()


def _move_dim_from_front(tensor: Tensor, dim: int) -> Tensor:
    """Inverse of :func:`_move_dim_to_front`."""

def _move_dim_from_front(tensor: Tensor, dim: int) -> Tensor:
    """Inverse of :func:`_move_dim_to_front`."""
    dim = _normalize_dim(dim, tensor.dim())
    if dim == 0:
        return tensor.contiguous()
    perm = [dim] + [i for i in range(tensor.dim()) if i != dim]
    inverse = [0] * len(perm)
    for idx, value in enumerate(perm):
        inverse[value] = idx

    world_size: int,
    gather_dim: int,
) -> tuple:
    """Launch async all-gather along ``gather_dim``."""
    x_perm = _move_dim_to_front(tensor.contiguous(), gather_dim)
    output_shape = list(x_perm.shape)
    output_shape[0] *= world_size
    out_perm, work = platform.all_gather_single(_detach_if_available(x_perm), output_shape, group, async_op=True)
    return work, out_perm


def _allgather_reconstruct(out_perm: Tensor, gather_dim: int) -> Tensor:
    """Move the leading communication buffer dimension back to ``gather_dim``.

            )
        co = cp_size // ds

        if ds == 1:
            if self.load_balance:
                return super().apply(module, device_mesh)
            return self._apply_colossal_async(module, device_mesh, cp_size, k_proj, v_proj)

        return self._apply_a2a_async(module, device_mesh, ds, co, q_proj, k_proj, v_proj)

    def _apply_colossal_async(
        self,
        module: Module,

        k_proj: Module,
        v_proj: Module,
    ) -> Module:
        """Register Pure Colossal async K/V AllGather hooks."""
        co_submesh = _ensure_1d(device_mesh)
        group = co_submesh.get_group()
        fwd_ag_slots = {"k": None, "v": None}
        bwd_ag_slots = {"k": [], "v": []}
        self._register_ag_proj_hooks(
            k_proj,
            v_proj,
            group=group,
            world_size=cp_size,

            world_size=cp_size,
            fwd_slots=fwd_ag_slots,
            bwd_slots=bwd_ag_slots,
        )
        platform.register_forward_pre_hook(
            module,
            partial(
                self._attn_pre_hook_colossal,
                co_submesh=co_submesh,

                bwd_slots=bwd_ag_slots,
            ),
            with_kwargs=True,
        )
        module.register_forward_hook(
            partial(self._post_hook_colossal, co_submesh=co_submesh)
        )
        return module

    def _apply_a2a_async(  # pylint: disable=too-many-arguments
        self,
        module: Module,


        if co == 1:
            ds_submesh = _ensure_1d(device_mesh)
            group = ds_submesh.get_group()
            pre_hook = partial(
                self._attn_pre_hook_ulysses,
                group=group,
                world_size=ds,
                fwd_slots=fwd_slots,

            dim_names = two_d_mesh.mesh_dim_names
            assert dim_names is not None, "2-D mesh must have mesh_dim_names (guaranteed by _build_2d_mesh)"
            ds_submesh = two_d_mesh[dim_names[1]]
            group = ds_submesh.get_group()
            pre_hook = partial(
                self._attn_pre_hook_hybrid,
                group=group,
                world_size=ds,
                two_d_mesh=two_d_mesh,

                fwd_slots=fwd_slots,
                bwd_slots=bwd_slots,
            )

        self._register_proj_hooks(
            q_proj,
            k_proj,
            v_proj,
            group=group,

            world_size=ds,
            fwd_slots=fwd_slots,
            bwd_slots=bwd_slots,
        )
        platform.register_forward_pre_hook(
            module,
            pre_hook,
            with_kwargs=True,
        )

        return output

    def _register_ag_proj_hooks(self, k_proj, v_proj, group, world_size, fwd_slots, bwd_slots):
        """Attach the async AllGather forward and backward hooks to the K and V projections.

        For ``k_proj`` (key ``"k"``) and then ``v_proj`` (key ``"v"``), a forward hook
        bound to ``_proj_ag_post_hook`` is registered on the module, followed by a
        full backward pre-hook bound to ``_proj_ag_bwd_pre_hook`` that receives that
        key's entry of ``bwd_slots``.
        """
        projections = {"k": k_proj, "v": v_proj}
        for slot_key, projection in projections.items():
            forward_hook = partial(
                self._proj_ag_post_hook,
                key=slot_key,
                group=group,
                world_size=world_size,
                fwd_slots=fwd_slots,
            )
            projection.register_forward_hook(forward_hook)
            backward_hook = partial(self._proj_ag_bwd_pre_hook, bwd_slot=bwd_slots[slot_key])
            platform.register_full_backward_pre_hook(projection, backward_hook)

    def _proj_ag_post_hook(  # pylint: disable=unused-argument,too-many-arguments
        self, module, inputs, output, key, group, world_size, fwd_slots
    ):
        """Start the async AllGather for one projection output and pass the output through.

        The local tensor (``output.to_local()`` for a ``DTensor``, otherwise ``output``
        itself) is handed to ``_launch_async_allgather_seq`` along ``self.seq_dim``.
        Whatever that launch returns is stored in ``fwd_slots[key]``. ``module`` and
        ``inputs`` are unused. ``output`` is returned unchanged.
        """
        if isinstance(output, DTensor):
            local_tensor = output.to_local()
        else:
            local_tensor = output
        launched = _launch_async_allgather_seq(local_tensor, group, world_size, self.seq_dim)
        fwd_slots[key] = launched
        return output

    def _get_qkv_value(self, args, kwargs, qkv_pos: int):
        """Return Q/K/V value from positional args or configured kwargs."""
        idx = self.qkv_indices[qkv_pos]

    def _get_qkv_value(self, args, kwargs, qkv_pos: int):
        """Return Q/K/V value from positional args or configured kwargs."""
        idx = self.qkv_indices[qkv_pos]
        if idx < len(args):
            return args[idx]
        if qkv_pos < len(self.qkv_kwarg_names):
            name = self.qkv_kwarg_names[qkv_pos]
            if name in kwargs:
                return kwargs[name]
        return None

    def _set_qkv_value(self, args, kwargs, qkv_pos: int, value):
        """Set Q/K/V value in positional args or configured kwargs."""
        idx = self.qkv_indices[qkv_pos]

    def _set_qkv_value(self, args, kwargs, qkv_pos: int, value):
        """Set Q/K/V value in positional args or configured kwargs."""
        idx = self.qkv_indices[qkv_pos]
        if idx < len(args):
            args[idx] = value
            return
        if qkv_pos < len(self.qkv_kwarg_names):
            name = self.qkv_kwarg_names[qkv_pos]
            if name in kwargs:
                kwargs[name] = value

        )

    def _wait_allgather(self, tensor, group, world_size, work, out_perm, bwd_slot=None):
        """Wait for pre-launched AllGather and return gathered tensor."""
        return platform.differentiable_async_allgather_wait(
            tensor,
            work,
            out_perm,
            group,

        transforms = (transform_q, transform_k, transform_v)
        for pos, transform in enumerate(transforms):
            value = self._get_qkv_value(new_args, new_kwargs, pos)
            if value is None:
                continue
            self._set_qkv_value(
                new_args,
                new_kwargs,
                pos,


        for pos, key in enumerate(("k", "v"), start=1):
            value = self._get_qkv_value(new_args, new_kwargs, pos)
            if value is None:
                continue
            local = _to_local(value)
            work, out_perm = fwd_slots[key]
            fwd_slots[key] = None
            gathered = self._wait_allgather(

        return (d_seq,) + grad_output[1:] if isinstance(grad_output, tuple) else (d_seq,)

    def _proj_ag_bwd_pre_hook(self, module, grad_output, bwd_slot):  # pylint: disable=unused-argument
        """Wait backward reduce-scatter just before K/V projection GEMM."""
        work, out_perm, gather_dim = bwd_slot.pop()
        work.wait()
        d_local = _allgather_reconstruct(out_perm, gather_dim)
        return (d_local,) + grad_output[1:] if isinstance(grad_output, tuple) else (d_local,)

hyper_parallel/core/context_parallel/async_dsa_context_parallel.py

    @staticmethod
    def _extract_tensor_output(output: Any) -> Any:
        """Return the tensor carried by ``output``, or ``None`` when there is none.

        ``output`` itself is returned when ``_is_tensor_or_dtensor`` accepts it. A
        one-element tuple or list is unwrapped when its only item is accepted.
        """
        if _is_tensor_or_dtensor(output):
            return output
        if not isinstance(output, (tuple, list)) or len(output) != 1:
            return None
        sole_item = output[0]
        return sole_item if _is_tensor_or_dtensor(sole_item) else None

    @staticmethod
    def _local_tensor(value: Any) -> Any:
        """Unwrap a ``DTensor`` through ``to_local()``; return any other value unchanged."""
        if isinstance(value, DTensor):
            return value.to_local()
        return value

                self.launch(slot_name, tensor)
            return output

        def _backward_pre_hook(hook_module, grad_output):
            del hook_module
            return self._producer_bwd_pre_hook(grad_output, bwd_slot)

        module.register_forward_hook(_post_hook)
        platform.register_full_backward_pre_hook(module, _backward_pre_hook)


    def _producer_bwd_pre_hook(self, grad_output: Any, bwd_slot: list) -> Any:
        """Wait deferred reduce-scatter before gradients cross the producer boundary."""
        if not bwd_slot:
            return grad_output
        work, out_perm, gather_dim = bwd_slot.pop()
        work.wait()
        d_local = _allgather_reconstruct(out_perm, gather_dim)
        if isinstance(grad_output, tuple):

        work.wait()
        d_local = _allgather_reconstruct(out_perm, gather_dim)
        if isinstance(grad_output, tuple):
            return (d_local,) + grad_output[1:]
        return (d_local,)

    def launch(self, slot_name: str, value: Any) -> None:
        """Start an all-gather for ``value`` and queue the result under ``slot_name``.

        Nothing is queued when ``value`` is rejected by ``_is_tensor_or_dtensor`` or
        when its local form is rejected by ``platform.is_tensor``. With
        ``self.world_size <= 1`` no communication is launched and the entry is
        ``(local, None, None)``. Otherwise the entry is ``(local, work, out_perm)``
        as returned by ``_launch_async_allgather_seq``.
        """
        if not _is_tensor_or_dtensor(value):
            return
        local = self._local_tensor(value)
        if not platform.is_tensor(local):
            return
        if self.world_size > 1:
            work, out_perm = _launch_async_allgather_seq(local, self.group, self.world_size, self.seq_dim)
            entry = (local, work, out_perm)
        else:
            # Single-rank group: record the tensor with no pending handle.
            entry = (local, None, None)
        self._slots.setdefault(slot_name, []).append(entry)

    def wait(self, slot_name: str, value: Any) -> Any:
        """Wait on a pre-launched gather, or fall back to consumer-local launch."""
        if not _is_tensor_or_dtensor(value):
            return value
        slot = self._slots.get(slot_name)
        if slot:
            local, work, out_perm = slot.pop(0)
            if work is None:

        if slot:
            local, work, out_perm = slot.pop(0)
            if work is None:
                return DTensor.from_local(local, self.device_mesh, (Replicate(),))
            gathered = platform.differentiable_async_allgather_wait(
                local,
                work,
                out_perm,
                self.group,

                self.world_size,
                self.seq_dim,
                self._bwd_slots.setdefault(slot_name, []),
            )
            return DTensor.from_local(gathered, self.device_mesh, (Replicate(),))
        return _to_sequence_replicate(value, self.device_mesh, self.seq_dim)


class AsyncDSAIndexerContextParallel(DSAIndexerContextParallel):

hyper_parallel/core/context_parallel/dsa_context_parallel.py

        if self.key_indexer_index is not None and self.key_indexer_index < len(args):
            return self._local_shape(args[self.key_indexer_index])
        if self.key_indexer_kwarg_name and self.key_indexer_kwarg_name in kwargs:
            return self._local_shape(kwargs[self.key_indexer_kwarg_name])
        return None

    @staticmethod
    def _get_local_idx(cp_mesh: DeviceMesh) -> int:
        """Return current rank's index in the CP mesh rank list."""

hyper_parallel/platform/mindspore/platform.py



def _normalize_dim(dim: int, ndim: int) -> int:
    """Normalize a possibly-negative dimension index."""
    return dim + ndim if dim < 0 else dim


def _move_dim_to_front(tensor: Tensor, dim: int) -> Tensor:
    """Return a contiguous tensor with axis ``dim`` moved to position 0.

    ``dim`` may be negative; it is normalized with ``_normalize_dim``. All other
    axes keep their original relative order.
    """
    rank = tensor.dim()
    front = _normalize_dim(dim, rank)
    if front == 0:
        return tensor.contiguous()
    trailing = [axis for axis in range(rank) if axis != front]
    reordered = tensor.permute([front] + trailing)
    return reordered.contiguous()


def _move_dim_from_front(tensor: Tensor, dim: int) -> Tensor:
    """Undo :func:`_move_dim_to_front`: put the leading axis back at position ``dim``.

    The forward permutation for ``dim`` is rebuilt, inverted, and applied. The
    result is returned contiguous.
    """
    rank = tensor.dim()
    target = _normalize_dim(dim, rank)
    if target == 0:
        return tensor.contiguous()
    forward = [target, *(axis for axis in range(rank) if axis != target)]
    # backward[a] is the position axis ``a`` occupies in ``forward``.
    backward = [0] * len(forward)
    for position, source_axis in enumerate(forward):
        backward[source_axis] = position
    restored = tensor.permute(backward)
    return restored.contiguous()


def _normalize_all_to_all_single_result(result, output: Tensor) -> tuple[Tensor, object]:
    """Normalize MindSpore all_to_all_single return values to ``(output, handle)``."""



def _normalize_all_gather_single_result(result, output: Tensor) -> tuple[Tensor, object]:
    """Normalize MindSpore all_gather_into_tensor return values to ``(output, handle)``."""
    if isinstance(result, tuple):
        if len(result) != 2:
            raise ValueError(
                "mindspore all_gather_into_tensor returned an unexpected tuple "
                f"with length {len(result)}"
            )
        return result
    return output, result


def _normalize_reduce_scatter_single_result(result, output: Tensor) -> tuple[Tensor, object]:
    """Normalize MindSpore reduce_scatter_tensor return values to ``(output, handle)``."""
    if isinstance(result, tuple):
        if len(result) != 2:
            raise ValueError(
                "mindspore reduce_scatter_tensor returned an unexpected tuple "
                f"with length {len(result)}"
            )
        return result
    return output, result


def _mindspore_all_to_all_single(input_tensor: Tensor, output_shape, group, async_op=False) -> tuple[Tensor, object]:
    """Launch MindSpore all_to_all_single and normalize return values."""



def _mindspore_all_gather_single(input_tensor: Tensor, output_shape, group, async_op=False) -> tuple[Tensor, object]:
    """Run ``ops_comm.all_gather_into_tensor`` into a fresh buffer and return ``(output, handle)``.

    The buffer is allocated with ``mint.empty`` using ``output_shape`` and the
    dtype of ``input_tensor``. The handle is ``None`` unless ``async_op`` is truthy.
    """
    buffer = mint.empty(tuple(output_shape), dtype=input_tensor.dtype)
    raw_result = ops_comm.all_gather_into_tensor(buffer, input_tensor, group=group, async_op=async_op)
    gathered, handle = _normalize_all_gather_single_result(raw_result, buffer)
    return gathered, (handle if async_op else None)


def _mindspore_reduce_scatter_single(
        input_tensor: Tensor, output_shape, group, async_op=False

def _mindspore_reduce_scatter_single(
        input_tensor: Tensor, output_shape, group, async_op=False
) -> tuple[Tensor, object]:
    """Run ``ops_comm.reduce_scatter_tensor`` into a fresh buffer and return ``(output, handle)``.

    The buffer is allocated with ``mint.empty`` using ``output_shape`` and the
    dtype of ``input_tensor``. The handle is ``None`` unless ``async_op`` is truthy.
    """
    buffer = mint.empty(tuple(output_shape), dtype=input_tensor.dtype)
    raw_result = ops_comm.reduce_scatter_tensor(buffer, input_tensor, group=group, async_op=async_op)
    scattered, handle = _normalize_reduce_scatter_single_result(raw_result, buffer)
    return scattered, (handle if async_op else None)


class AsyncCollectiveTensor(Tensor):
    """MindSpore Tensor subclass that defers ``CommHandle.wait()`` to


    @staticmethod
    def forward(ctx, x, work, out_perm, group, world_size, gather_dim, handle_box):  # pylint: disable=arguments-differ
        """Block on the already-launched all-gather and return the gathered tensor.

        The group, world size, gather dimension, handle box and the shape of ``x``
        are stored on ``ctx``. After ``work.wait()`` the communication buffer
        ``out_perm`` has its leading axis moved back to ``gather_dim``.
        """
        ctx.handle_box = handle_box
        ctx.gather_dim = gather_dim
        ctx.world_size = world_size
        ctx.group = group
        ctx.x_shape = tuple(x.shape)
        work.wait()
        gathered = _move_dim_from_front(out_perm, gather_dim)
        return gathered

    @staticmethod
    def backward(ctx, grad_output):
        """Launch reverse reduce-scatter for the all-gather."""
        grad_perm = _move_dim_to_front(grad_output.contiguous(), ctx.gather_dim)
        output_shape = list(grad_perm.shape)
        if output_shape[0] % ctx.world_size != 0:
            raise ValueError(
                "all_gather backward expected gathered dimension to be divisible by world_size, "
                f"got {output_shape[0]} and {ctx.world_size}."
            )
        output_shape[0] //= ctx.world_size
        output, work = _mindspore_reduce_scatter_single(
            grad_perm,
            output_shape,
            ctx.group,
            async_op=True,

            output_shape,
            ctx.group,
            async_op=True,
        )
        if ctx.handle_box is not None:
            ctx.handle_box.append((work, output, ctx.gather_dim))
            return mint.zeros(ctx.x_shape, dtype=grad_output.dtype), None, None, None, None, None, None
        work.wait()
        return _move_dim_from_front(output, ctx.gather_dim), None, None, None, None, None, None


class MindSporePlatform(Platform):
    """MindSpore platform api"""

        return group_name

    @staticmethod
    def all_gather_into_tensor(data, group_info, async_op=False):
        """All-gather ``data``, growing its leading dimension by the group size.

        ``group_info`` is either a group-name string, whose size is looked up with
        ``get_group_size``, or an object exposing ``group_name`` and ``rank_size``.
        Returns the result of ``_mindspore_all_gather_single``.
        """
        if isinstance(group_info, str):
            group_name = group_info
            rank_size = get_group_size(group_name)
        else:
            group_name = group_info.group_name
            rank_size = group_info.rank_size
        gathered_shape = list(data.shape)
        gathered_shape[0] *= rank_size
        return _mindspore_all_gather_single(data, gathered_shape, group_name, async_op=async_op)

    @staticmethod
    def all_gather_single(input_tensor, output_shape, group, async_op=False):
        """Forward all arguments to ``_mindspore_all_gather_single`` and return its result."""
        return _mindspore_all_gather_single(input_tensor, output_shape, group, async_op=async_op)

    @staticmethod
    def all_reduce(data, group_info, async_op=False):
        if isinstance(group_info, str):

        return data

    @staticmethod
    def reduce_scatter_tensor(data, group_info, async_op=False):
        """Reduce-scatter ``data``, floor-dividing its leading dimension by the group size.

        ``group_info`` is either a group-name string, whose size is looked up with
        ``get_group_size``, or an object exposing ``group_name`` and ``rank_size``.
        Returns the result of ``_mindspore_reduce_scatter_single``.
        """
        if isinstance(group_info, str):
            group_name = group_info
            rank_size = get_group_size(group_name)
        else:
            group_name = group_info.group_name
            rank_size = group_info.rank_size
        scattered_shape = list(data.shape)
        scattered_shape[0] //= rank_size
        return _mindspore_reduce_scatter_single(data, scattered_shape, group_name, async_op=async_op)

    @staticmethod
    def reduce_scatter_single(input_tensor, output_shape, group, async_op=False):
        """Forward all arguments to ``_mindspore_reduce_scatter_single`` and return its result."""
        return _mindspore_reduce_scatter_single(input_tensor, output_shape, group, async_op=async_op)

    @staticmethod
    def all_to_all_single(input_tensor, output_shape, group, async_op=False):
        """Forward all arguments to ``_mindspore_all_to_all_single`` and return its result."""
        return _mindspore_all_to_all_single(input_tensor, output_shape, group, async_op=async_op)


    @staticmethod
    def differentiable_async_allgather_wait(x, work, out_perm, group, world_size, gather_dim,
                                            handle_box=None):
        """Apply ``_MSAsyncAllGatherFunction`` to the arguments, in the order given.

        ``handle_box`` defaults to ``None`` and is passed through as the last argument.
        NOTE(review): presumably ``handle_box`` is the list that receives deferred
        backward handles — confirm against ``_MSAsyncAllGatherFunction``.
        """
        return _MSAsyncAllGatherFunction.apply(
            x, work, out_perm, group, world_size, gather_dim, handle_box
        )

    @staticmethod