Diff Coverage

Diff: origin/master...HEAD, staged and unstaged changes

Source File Diff Coverage (%) Missing Lines
hyper_parallel/core/moe_utils.py 98.4% 115
hyper_parallel/platform/torch/common/moe.py 100%  
hyper_parallel/trainer/callbacks/base.py 0.0% 228-231,667,669-670,673-676,683-684,686-688,693,695,698-699,701
hyper_parallel/trainer/config.py 0.0% 278-280
hyper_parallel/core/moe_utils.py
111
112
113
114
115
116
117
118
119

    if hasattr(group, "group"):
        return group

    return SimpleNamespace(group=group)


def _get_moe_layers(model: "nn.Module") -> list:
    """Collect all MoE sub-modules with ``enable_expert_bias=True``.
hyper_parallel/trainer/callbacks/base.py
224
225
226
227
228
229
230
231
232
233
234
235
                metrics["tflops"] = f"{observed_tflops:.1f}"
                metrics["mfu"] = f"{mfu * 100:.1f}%"

        # Include aux_loss from MoEMonitorCallback when available.
        moe_cb = self.trainer.moe_monitor_callback
        aux_loss = getattr(moe_cb, 'last_mean_aux_loss', None)
        if aux_loss is not None:
            metrics["aux_loss"] = f"{aux_loss:.6f}"

        logger.info_rank0(" | ".join(f"{k}={v}" for k, v in metrics.items()))

        record = {
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
        """Initialize MoEMonitorCallback from trainer config."""
        super().__init__(trainer)
        moe_cfg = getattr(trainer.args, 'moe_monitor', None)
        self.enabled = getattr(moe_cfg, 'enabled', False) if moe_cfg else False
        self._impl = None

        if self.enabled:
            from hyper_parallel.core.moe_utils import (  # pylint: disable=C0415
                MoEMonitorCallback as _CoreMoEMonitorCallback,
            )
            lr = getattr(moe_cfg, 'lr', 1e-3)
            every_n_steps = getattr(moe_cfg, 'every_n_steps', 1)
            num_recomputations = getattr(moe_cfg, 'num_recomputations', 1)
            self._impl = _CoreMoEMonitorCallback(
                model=self.trainer.model,
                lr=lr,
                every_n_steps=every_n_steps,
                num_recomputations=num_recomputations,
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
                every_n_steps=every_n_steps,
                num_recomputations=num_recomputations,
            )

    @property
    def last_mean_aux_loss(self) -> Optional[float]:
        """Expose ``last_mean_aux_loss`` of the wrapped core callback.

        Returns:
            The value read from ``self._impl.last_mean_aux_loss`` when a core
            implementation is attached, otherwise ``None``.
        """
        impl = self._impl
        # No core implementation attached means there is nothing to report.
        return None if impl is None else impl.last_mean_aux_loss

    def on_train_begin(self, state: "TrainerState", **kwargs) -> None:
        """Emit a single info message at train start if monitoring is on.

        Nothing is logged when ``self.enabled`` is falsy, and the rank is
        only queried once monitoring is known to be enabled.

        Args:
            state: Trainer state passed by the callback dispatcher (unused).
            **kwargs: Extra hook arguments (unused).
        """
        if not self.enabled:
            return
        # Restrict the message to the process whose rank is 0.
        is_rank_zero = platform.get_rank() == 0
        if is_rank_zero:
            logger.info("MoEMonitorCallback: MoE expert-load monitoring enabled")

    def on_step_end(self, state: "TrainerState", *, loss: float = None,
                    grad_norm: float = None, **kwargs) -> None:
        """Forward the step-end hook to the wrapped core callback.

        Args:
            state: Trainer state passed by the callback dispatcher (unused).
            loss: Step loss supplied by the dispatcher (unused here).
            grad_norm: Step gradient norm supplied by the dispatcher (unused here).
            **kwargs: Extra hook arguments (unused).
        """
        impl = self._impl
        if impl is None:
            # No core implementation attached: nothing to delegate to.
            return
        impl.on_step_end()

    def on_substep_end(self, state: "TrainerState", **kwargs) -> None:
        """Intentionally do nothing at the end of a sub-step.

        This hook has no body beyond its docstring; the delegation to the
        wrapped core callback is performed in ``on_step_end`` instead.

        Args:
            state: Trainer state passed by the callback dispatcher (unused).
            **kwargs: Extra hook arguments (unused).
        """

class GradientHealthCallback(Callback):
    """Detect NaN / Inf grad_norm and raise / warn.
hyper_parallel/trainer/config.py
274
275
276
277
278
279
280
281
282
283
284
        num_recomputations: Number of forward executions per optimizer step.
            Default ``1``. Set to ``2`` when activation checkpoint is enabled.
    """
    enabled: bool = False
    lr: float = 1e-3
    every_n_steps: int = 1
    num_recomputations: int = 1

@dataclass
class EvalConfig:
    """``train.eval.*`` — eval cadence + dataset."""