Coverage for hyper_parallel / platform / torch / fully_shard / grad_hook.py: 0%

10 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-03-01 07:33 +0800

# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Torch HSDP gradient hook"""
from hyper_parallel.core.fully_shard.hsdp_grad_hook import HSDPGradHook

class TorchHSDPGradHook(HSDPGradHook):
    """
    Torch HSDP gradient hook for hybrid sharded data parallel training.

    Extends the base HSDPGradHook with the PyTorch-specific final hook,
    covering gradient casting, scaling and parameter gradient management.
    """

    def _get_final_grad_hook(self, param, grad_hook, no_cast=False):
        """
        Create a final gradient hook that adds casting and scaling operations.

        Args:
            param: The parameter tensor to apply the gradient hook to.
            grad_hook: The base gradient hook function.
            no_cast (bool): Whether to skip gradient casting operations,
                defaults to False.

        Returns:
            function: A gradient hook that runs the base cast/scale pipeline,
            writes the result back into the incoming gradient tensor in place,
            and clears ``param.grad``.
        """
        # Base class assembles the cast/scale processing chain around grad_hook.
        processed = super()._get_final_grad_hook(param, grad_hook, no_cast)

        def _install_grad(grad):
            # Overwrite the incoming gradient's storage with the processed
            # result, then drop param.grad so the stale reference is released.
            grad.data = processed(grad)
            param.grad = None
            return grad

        return _install_grad