Diff Coverage

Diff: origin/master...HEAD, staged and unstaged changes

| Source File | Diff Coverage (%) | Missing Lines |
|---|---|---|
| hyper_parallel/platform/torch/fully_shard/param.py | 100% | |
| hyper_parallel/platform/torch/fully_shard/state.py | 50.0% | 269 |
| hyper_parallel/trainer/base.py | 0.0% | 1214-1215 |
hyper_parallel/platform/torch/fully_shard/state.py
265
266
267
268
269
270
271
272
273

    def lazy_init(self):
        if self.is_shard and not self._reset_sharded_params:
            for hsdp_param in self.hsdp_params:
                hsdp_param.reset_sharded_param()
            self._reset_sharded_params = True
        self._validate_no_meta_params()
        self._validate_cpu_offload_params()
        self._init_mp_dtypes()
hyper_parallel/trainer/base.py
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
        # Re-tie weights — ``to_empty`` gives every nn.Parameter fresh
        # storage so ``__init__``-time ties are broken. Must happen before
        # ``lazy_init`` re-wraps params as DTensor (non-leaf), which would
        # cause ``register_parameter`` to reject the assignment.
        if hasattr(self.model, "tie_weights"):
            self.model.tie_weights()
        # ``to_empty`` strips DTensor; ``lazy_init`` re-wraps shards before
        # ``_load_weights`` / optimizer step see the params (the forward
        # pre-hook does the same later, but the loader needs DTensor first).
        reset_count = self._lazy_init_hsdp_modules()