Diff Coverage

Source File	Diff Coverage (%)	Missing Lines
hyper_parallel/platform/torch/activation_checkpoint/__init__.py	100%
hyper_parallel/platform/torch/activation_checkpoint/recompute_session.py	90.3%	178,206,250,315,325,327-329,353,380,400,408,416,480,489-490,534
hyper_parallel/platform/torch/platform.py	47.1%	1344-1345,1396-1397,1409,1424-1425,1435-1436

hyper_parallel/platform/torch/activation_checkpoint/recompute_session.py

                activations.
        """
        frame = self._frame
        if frame.is_recomputed[session_id]:
            return

        args, kwargs = frame.get_inputs()

        try:

        # Null out handles in all live holders for this session.
        for weak_holder in frame.weak_holders:
            holder = weak_holder()
            if holder is not None and session_id in holder.handles:
                holder.handles[session_id] = None


# ---------------------------------------------------------------------------
# Context variables

    Yields:
        The ``session_id`` string that is active for the scope.
    """
    if session_id is None:
        session_id = str(uuid.uuid4())
    session = _RecomputeSession(session_id=session_id, retain_on_unpack=retain_on_unpack)
    token = _recompute_session.set(session)
    try:
        yield session_id

    ) -> None:
        def pack_hook(x: torch.Tensor) -> torch.Tensor:
            frame = target_frame_ref()
            if frame is None:
                raise RuntimeError(
                    "CheckpointFrame has been garbage collected during recomputation."
                )

            frame.recomp_counter[session_id] += 1


            # If recomputation produces more tensors than the original forward
            # saved, either silently ignore or error.
            if recomp_idx >= len(frame.weak_holders):
                if not frame.early_stop and not frame.forward_completed:
                    # Allow the extra tensor through without caching.
                    frame.ignore_saved_mismatch = True
                    return x
                raise RuntimeError(
                    "Recompute session: more tensors were saved during "
                    "recomputation than during the original forward pass."
                )


            return x

        def unpack_hook(x: torch.Tensor) -> torch.Tensor:
            return x

        super().__init__(pack_hook, unpack_hook)


        def unpack_hook(holder: _Holder) -> torch.Tensor:
            session = _recompute_session.get()
            if session is None:
                raise RuntimeError(
                    "checkpoint_with_session: unpack triggered outside a "
                    "recompute session context.  Wrap backward in "
                    "_recompute_session_ctx()."
                )

                    pass
                frame.is_recomputed[key] = True

            if key not in holder.handles:
                raise RuntimeError(
                    f"checkpoint_with_session: session '{key}' has no handle "
                    "for this holder.  The recomputation may have saved a "
                    "different number of tensors than the original forward."
                )

                )

            handle = holder.handles[key]
            if handle is None:
                raise RuntimeError(
                    "checkpoint_with_session: unpack triggered for a tensor "
                    "that has already been unpacked once in this session.  "
                    "If you need to access the tensor multiple times, use "
                    "retain_on_unpack=True."

                    "retain_on_unpack=True."
                )

            if handle not in frame.recomputed[key]:
                raise RuntimeError(
                    "checkpoint_with_session: handle not found in recomputed "
                    f"cache for session '{key}'."
                )

    Raises:
        ValueError: If ``use_reentrant=True``.
    """
    if use_reentrant:
        raise ValueError(
            "checkpoint_with_session does not support use_reentrant=True.  "
            "Session-based checkpointing requires the non-reentrant path."
        )

    session = _recompute_session.get()

    # If no session is active, fall back to vanilla torch checkpoint.
    if session is None:
        from torch.utils.checkpoint import checkpoint as _torch_checkpoint  # pylint: disable=C0415
        return _torch_checkpoint(
            function, *args, use_reentrant=False, **kwargs
        )

    # -- Session is active: use our custom machinery. ------------------------

    if isinstance(value, torch.Tensor):
        detached = value.detach()
        detached.requires_grad = value.requires_grad
        return detached
    return value

hyper_parallel/platform/torch/platform.py


    @property
    def checkpoint(self):
        """Return the ``checkpoint_with_session`` callable.

        The import is performed inside the property body rather than at
        module top level, so the recompute-session module is only loaded
        when this property is read.

        Returns:
            The ``checkpoint_with_session`` function object itself (it is not
            called here).
        """
        # pylint: disable=C0415
        from hyper_parallel.platform.torch.activation_checkpoint.recompute_session import (
            checkpoint_with_session as session_checkpoint,
        )
        return session_checkpoint

    @staticmethod
    def checkpoint_wrapper(module, **checkpoint_kwargs):
        # pylint: disable=C0415

            A list populated with one opaque recompute handle per checkpointed
            block executed during the forward pass within the context.
        """
        # pylint: disable=C0415
        from hyper_parallel.platform.torch.activation_checkpoint.recompute_session import _recompute_handle_collector_ctx
        return _recompute_handle_collector_ctx()

    @staticmethod
    def recompute_handle(handle, session_id):
        """Eagerly fire one checkpointed block's forward re-run.

                :meth:`recompute_handle_collector_ctx`.
            session_id: Stable key shared by the producing re-run and the
                consuming backward.
        """
        return handle.recompute(session_id)

    @staticmethod
    def recompute_session_ctx(session_id=None, retain_on_unpack=False):
        """Build the context manager that binds recompute unpack to a session.

        Both arguments are forwarded unchanged, by keyword, to
        ``_recompute_session_ctx``; this wrapper adds no logic of its own.

        Args:
            session_id: Caller-provided session key. Defaults to ``None``.
            retain_on_unpack: Flag passed straight through to the underlying
                context manager. Defaults to ``False``.

        Returns:
            A context manager activating the session for its scope.
        """
        # pylint: disable=C0415
        from hyper_parallel.platform.torch.activation_checkpoint.recompute_session import (
            _recompute_session_ctx as open_session,
        )
        return open_session(session_id=session_id, retain_on_unpack=retain_on_unpack)

    @staticmethod
    def clear_recompute_session(session_id):
        """Drop the recompute data retained for one session.

        Forwards to ``_clear_recompute_session`` from the recompute-session
        module, which is imported inside the function body.

        Args:
            session_id: The session key whose cached recompute data is cleared.

        Returns:
            The result of ``_clear_recompute_session(session_id)``.
        """
        # pylint: disable=C0415
        from hyper_parallel.platform.torch.activation_checkpoint.recompute_session import (
            _clear_recompute_session as drop_session,
        )
        return drop_session(session_id)

    @staticmethod
    def get_element_size(tensor):
        """Get Tensor Element Size"""